Add generic plugin architecture and watch-directory feature

Introduces a manifest contract so feature containers self-describe their settings (JSON Schema + access rules). Backend and frontend gain generic plugin proxy and dynamic Extensions UI with zero feature-specific code. Doc-service is the first plugin consumer: exposes /plugin/manifest and /plugin/settings, adds a watchdog-based file watcher that auto-ingests PDFs from a mounted directory, maps subfolders to categories, supports AI-suggested folder/filename (user-confirmed), and enforces a no-remove policy. Access is gated by is_superuser or doc-service-admin group. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-18 02:09:50 +02:00
parent 2d7207b62f
commit 00466a9801
29 changed files with 1373 additions and 52 deletions
@@ -1,15 +1,45 @@
+import asyncio
+import logging
+from contextlib import asynccontextmanager
+
 from fastapi import FastAPI

 from app.core.config import settings
 from app.routers import categories, documents
+from app.routers import plugin as plugin_router

-app = FastAPI(title=settings.PROJECT_NAME)
+logger = logging.getLogger(__name__)
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Run the optional watch-directory file watcher for the app's lifetime.
+
+    On startup the storage config is read and, when ``watch_enabled`` is set,
+    a ``FileWatcherService`` is started on ``watch_path``.  Any failure while
+    starting the watcher is logged and swallowed so the API still comes up.
+    On shutdown the watcher (if one was created) is stopped.
+    """
+    loop = asyncio.get_running_loop()
+    watcher = None
+
+    try:
+        # Imported inside the try block so that a failing import only disables
+        # the watcher instead of preventing the service from starting.
+        from app.services.config_reader import get_storage_config
+        storage_config = await get_storage_config()
+        if storage_config.get("watch_enabled"):
+            from app.services.file_watcher import FileWatcherService
+            watcher = FileWatcherService(loop)
+            await watcher.start(storage_config["watch_path"], storage_config)
+    except Exception as exc:
+        logger.warning("[doc-service] File watcher could not start: %s", exc)
+
+    try:
+        yield
+    finally:
+        # An exception or cancellation raised while the app is running is
+        # re-raised at the ``yield``; ``finally`` guarantees the watcher is
+        # still stopped in that case.
+        if watcher is not None:
+            await watcher.stop()
+
+
+app = FastAPI(title=settings.PROJECT_NAME, lifespan=lifespan)

 # No CORS — this service is only reachable from the main backend on backend-net.
 # All browser traffic goes through the main backend proxy.

 app.include_router(documents.router, prefix="/documents", tags=["documents"])
 app.include_router(categories.router, prefix="/categories", tags=["categories"])
+app.include_router(plugin_router.router, prefix="/plugin", tags=["plugin"])


@app.get("/health")
@@ -27,6 +27,12 @@ class Document(Base):
    )
    processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)

+    # Watch-directory ingestion fields (migration 0003)
+    source: Mapped[str] = mapped_column(String(16), nullable=False, default="upload")
+    watch_path: Mapped[str | None] = mapped_column(String, nullable=True)
+    suggested_folder: Mapped[str | None] = mapped_column(String(128), nullable=True)
+    suggested_filename: Mapped[str | None] = mapped_column(String(500), nullable=True)
+
    category_assignments: Mapped[list["CategoryAssignment"]] = relationship(
        "CategoryAssignment", back_populates="document", cascade="all, delete-orphan"
    )
@@ -5,6 +5,8 @@ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession

+from sqlalchemy import or_
+
 from app.database import AsyncSessionLocal, get_db
 from app.deps import get_user_id
 from app.models.category import DocumentCategory
@@ -15,6 +17,9 @@ from app.services.ai_client import classify_document

 router = APIRouter()

+# Sentinel user_id for watch-ingested categories — must match documents.py
+_WATCH_USER_ID = "watch"
+
 _SIMILARITY_THRESHOLD = 0.4


@@ -81,9 +86,10 @@ async def list_categories(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
 ) -> list[DocumentCategory]:
+    # Include watch-ingested categories so they appear in the sidebar/filter
    result = await db.execute(
        select(DocumentCategory)
-        .where(DocumentCategory.user_id == user_id)
+        .where(or_(DocumentCategory.user_id == user_id, DocumentCategory.user_id == _WATCH_USER_ID))
        .order_by(DocumentCategory.name)
    )
    return result.scalars().all()
@@ -26,13 +26,21 @@ router = APIRouter()

 _DEFAULT_MAX_BYTES = 20 * 1024 * 1024

+# Sentinel user_id used for watch-directory-ingested documents.
+# These documents are visible to all authenticated users.
+_WATCH_USER_ID = "watch"
+

 # ── Helpers ───────────────────────────────────────────────────────────────────

 async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
+    """Fetch a document owned by user_id OR a watch-ingested document (visible to all)."""
    result = await db.execute(
        select(Document)
-        .where(Document.id == doc_id, Document.user_id == user_id)
+        .where(
+            Document.id == doc_id,
+            or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
+        )
        .options(
            selectinload(Document.category_assignments)
            .selectinload(CategoryAssignment.category)
@@ -61,6 +69,10 @@ def _doc_with_categories(doc: Document) -> DocumentOut:
        created_at=doc.created_at,
        processed_at=doc.processed_at,
        categories=cats,
+        source=doc.source,
+        watch_path=doc.watch_path,
+        suggested_folder=doc.suggested_folder,
+        suggested_filename=doc.suggested_filename,
    )


@@ -183,7 +195,8 @@ async def list_documents(
    sort_expr = sort_col.desc() if order == "desc" else sort_col.asc()

    # Build filter conditions once and reuse for both count + items queries.
-    conditions = [Document.user_id == user_id]
+    # Watch-ingested documents (user_id = "watch") are visible to all users.
+    conditions = [or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID)]
    if status:
        conditions.append(Document.status == status)
    if document_type:
@@ -247,7 +260,10 @@ async def get_document_status(
    db: AsyncSession = Depends(get_db),
 ) -> Document:
    result = await db.execute(
-        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
+        select(Document).where(
+            Document.id == doc_id,
+            or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
+        )
    )
    doc = result.scalar_one_or_none()
    if doc is None:
@@ -347,7 +363,10 @@ async def download_file(
    db: AsyncSession = Depends(get_db),
 ) -> StreamingResponse:
    result = await db.execute(
-        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
+        select(Document).where(
+            Document.id == doc_id,
+            or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
+        )
    )
    doc = result.scalar_one_or_none()
    if doc is None:
@@ -374,9 +393,12 @@ async def assign_category(
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
 ) -> None:
-    # Verify both belong to this user
+    # Verify the document is accessible (own or watch-ingested)
    doc_result = await db.execute(
-        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
+        select(Document).where(
+            Document.id == doc_id,
+            or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
+        )
    )
    if doc_result.scalar_one_or_none() is None:
        raise HTTPException(status_code=404, detail="Document not found")
@@ -418,3 +440,81 @@ async def remove_category(
    if assignment:
        await db.delete(assignment)
        await db.commit()
+
+
+# ── AI suggestion confirmation ────────────────────────────────────────────────
+# These endpoints allow users to confirm or reject AI suggestions on
+# watch-ingested documents.  No disk mutations — suggestions only update the DB.
+
+# Confirm the pending AI folder suggestion for a document: assign the document
+# to a category named after ``suggested_folder`` (created under the watch
+# sentinel user when missing) and clear the suggestion.  Responds with 400 when
+# no folder suggestion is pending.
+@router.post("/{doc_id}/suggestions/folder/confirm", status_code=204)
+async def confirm_folder_suggestion(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    # Accepts the caller's own documents as well as watch-ingested ones;
+    # presumably raises 404 when nothing matches — TODO confirm in _get_user_doc.
+    doc = await _get_user_doc(doc_id, user_id, db)
+    if not doc.suggested_folder:
+        raise HTTPException(status_code=400, detail="No folder suggestion pending")
+
+    # Find or create the suggested category under the watch sentinel user
+    cat_result = await db.execute(
+        select(DocumentCategory).where(
+            DocumentCategory.user_id == _WATCH_USER_ID,
+            DocumentCategory.name == doc.suggested_folder,
+        )
+    )
+    cat = cat_result.scalar_one_or_none()
+    if cat is None:
+        cat = DocumentCategory(user_id=_WATCH_USER_ID, name=doc.suggested_folder[:128])
+        db.add(cat)
+        # Commit + refresh so cat.id is available for the assignment below.
+        await db.commit()
+        await db.refresh(cat)
+
+    # Assign if not already assigned
+    exists = await db.execute(
+        select(CategoryAssignment).where(
+            CategoryAssignment.document_id == doc_id,
+            CategoryAssignment.category_id == cat.id,
+        )
+    )
+    if exists.scalar_one_or_none() is None:
+        db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))
+
+    # Clearing the field marks the suggestion as handled; the new assignment
+    # (if any) and the cleared field are persisted by the same commit.
+    doc.suggested_folder = None
+    await db.commit()
+
+
+# Reject the AI folder suggestion: clear ``suggested_folder`` without touching
+# the document's categories.  The field is cleared unconditionally, so the call
+# also succeeds when no suggestion was pending.
+@router.post("/{doc_id}/suggestions/folder/reject", status_code=204)
+async def reject_folder_suggestion(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    doc = await _get_user_doc(doc_id, user_id, db)
+    doc.suggested_folder = None
+    await db.commit()
+
+
+# Confirm the AI filename suggestion: copy ``suggested_filename`` into the
+# document's ``title`` and clear the suggestion.  Only the DB row is updated.
+# Responds with 400 when no filename suggestion is pending.
+@router.post("/{doc_id}/suggestions/filename/confirm", status_code=204)
+async def confirm_filename_suggestion(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    doc = await _get_user_doc(doc_id, user_id, db)
+    if not doc.suggested_filename:
+        raise HTTPException(status_code=400, detail="No filename suggestion pending")
+    doc.title = doc.suggested_filename
+    doc.suggested_filename = None
+    await db.commit()
+
+
+# Reject the AI filename suggestion: clear ``suggested_filename`` and leave the
+# document title as it is.  The field is cleared unconditionally, so the call
+# also succeeds when no suggestion was pending.
+@router.post("/{doc_id}/suggestions/filename/reject", status_code=204)
+async def reject_filename_suggestion(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    doc = await _get_user_doc(doc_id, user_id, db)
+    doc.suggested_filename = None
+    await db.commit()
@@ -0,0 +1,97 @@
+"""
+Plugin manifest and settings endpoints for doc-service.
+
+These are internal-only — they are called by the main backend's generic plugin
+proxy, never directly by the browser.  No authentication is applied here because
+the backend enforces access control before forwarding the request.
+
+Endpoints:
+  GET  /plugin/manifest   → static manifest with JSON Schema for settings
+  GET  /plugin/settings   → current storage config values
+  PATCH /plugin/settings  → update storage config (partial update)
+"""
+from fastapi import APIRouter
+from pydantic import BaseModel
+
+from app.services.config_reader import get_storage_config, save_storage_config
+
+router = APIRouter()
+
+# Static manifest served by GET /plugin/manifest: plugin identity, the access
+# rules (superuser or membership in a required group) and the JSON Schema that
+# describes the storage/watch settings.
+_MANIFEST: dict = {
+    "id": "doc-service",
+    "name": "Document Service",
+    "icon": "file-text",
+    "version": "1.0",
+    "access": {
+        "allow_superuser": True,
+        "required_groups": ["doc-service-admin"],
+    },
+    "settings_schema": {
+        "type": "object",
+        "title": "Storage & Watch",
+        "properties": {
+            "watch_enabled": {
+                "type": "boolean",
+                "title": "Enable file watching",
+                "description": (
+                    "Automatically ingest PDF files added to the mounted watch directory. "
+                    "Requires a service restart to take effect after toggling."
+                ),
+            },
+            # Marked readOnly here and also absent from StorageSettingsUpdate,
+            # so it cannot be changed through PATCH /plugin/settings.
+            "watch_path": {
+                "type": "string",
+                "title": "Watch path",
+                "readOnly": True,
+                "description": "Configured via Docker volume mount — edit docker-compose to change.",
+            },
+            "ai_folder_suggestion": {
+                "type": "boolean",
+                "title": "AI folder suggestion",
+                "description": (
+                    "AI suggests a category for each ingested document. "
+                    "You must confirm the suggestion before it is applied."
+                ),
+            },
+            "ai_folder_default": {
+                "type": "string",
+                "title": "Default import category",
+                "description": "Category assigned automatically when AI folder suggestion is disabled.",
+            },
+            "ai_rename_suggestion": {
+                "type": "boolean",
+                "title": "AI rename suggestion",
+                "description": (
+                    "AI suggests a document title for each ingested file. "
+                    "You must confirm before it is applied."
+                ),
+            },
+        },
+    },
+}
+
+
+# Request body for PATCH /plugin/settings.  Every field defaults to None;
+# update_settings drops None-valued fields before saving, so keys that are
+# omitted from the request keep their stored values.
+class StorageSettingsUpdate(BaseModel):
+    watch_enabled: bool | None = None
+    ai_folder_suggestion: bool | None = None
+    ai_folder_default: str | None = None
+    ai_rename_suggestion: bool | None = None
+    # watch_path is intentionally excluded — it cannot be changed via API
+
+
+# Serve the static plugin manifest (the module-level _MANIFEST dict) as-is.
+@router.get("/manifest")
+async def get_manifest() -> dict:
+    return _MANIFEST
+
+
+# Return the current storage config as produced by get_storage_config.
+@router.get("/settings")
+async def get_settings() -> dict:
+    return await get_storage_config()
+
+
+@router.patch("/settings")
+async def update_settings(body: StorageSettingsUpdate) -> dict:
+    """Apply a partial update to the storage config and return the result.
+
+    Fields left unset (``None``) in the request body are not written.
+    ``ai_folder_default`` is normalised to at most 128 characters without
+    surrounding whitespace and falls back to ``"imports"`` when nothing is left.
+    """
+    update = body.model_dump(exclude_none=True)
+    if "ai_folder_default" in update:
+        # Strip before truncating so surrounding whitespace does not count
+        # against the 128-character limit (truncating first could cut a padded
+        # value down to nothing and silently replace it with "imports").  The
+        # second strip removes whitespace exposed at the cut position.
+        cleaned = update["ai_folder_default"].strip()[:128].strip()
+        update["ai_folder_default"] = cleaned or "imports"
+    await save_storage_config(update)
+    return await get_storage_config()
@@ -23,6 +23,10 @@ class DocumentOut(BaseModel):
    created_at: datetime
    processed_at: datetime | None
    categories: list[CategoryOut] = []
+    source: str = "upload"
+    watch_path: str | None = None
+    suggested_folder: str | None = None
+    suggested_filename: str | None = None

    model_config = {"from_attributes": True}

@@ -14,6 +14,14 @@ from pathlib import Path

 from app.core.config import settings

+# Built-in defaults for the "storage" config block.  get_storage_config starts
+# from a copy of these and overlays whatever the loaded config provides.
+_DEFAULT_STORAGE_CONFIG: dict = {
+    "watch_enabled": False,
+    "watch_path": "/data/watch",
+    "ai_folder_suggestion": False,
+    "ai_folder_default": "imports",
+    "ai_rename_suggestion": False,
+}
+
 _DEFAULT_SYSTEM_PROMPT = (
    "You are a financial document analysis assistant. "
    "Given the text extracted from a PDF document, return ONLY a JSON object "
@@ -43,6 +51,7 @@ _DEFAULT_USER_TEMPLATE = (

 _DEFAULT_CONFIG: dict = {
    "documents": {"max_pdf_bytes": 20 * 1024 * 1024},
+    "storage": _DEFAULT_STORAGE_CONFIG,
    "system_prompts": {
        "system": _DEFAULT_SYSTEM_PROMPT,
        "user_template": _DEFAULT_USER_TEMPLATE,
@@ -64,6 +73,25 @@ def _read_config_sync() -> dict:
    return _apply_env_overrides(base)


+def _read_config_sync_raw() -> dict:
+    """Load the config exactly as stored on disk, without env overrides.
+
+    Used for read-modify-write cycles, where values coming from environment
+    overrides must not be written back into the file.  Returns a deep copy of
+    the built-in defaults when the config file does not exist.
+    """
+    config_file = Path(settings.CONFIG_PATH)
+    if config_file.exists():
+        with open(config_file) as handle:
+            return json.load(handle)
+    return deepcopy(_DEFAULT_CONFIG)
+
+
+def _write_config_sync(config: dict) -> None:
+    """Atomically write config JSON to disk.
+
+    The JSON is written to a uniquely named temporary file next to the target
+    and then moved over it with ``os.replace``, so readers never observe a
+    partially written config.  The temporary name is unique per call because
+    this function runs in worker threads: with a shared name, two overlapping
+    writers would interleave their output in the same temporary file.
+
+    Raises:
+        OSError: if the directory or file cannot be written.
+        TypeError: if ``config`` contains values that are not JSON-serialisable.
+    """
+    import uuid  # local import: only needed for the temporary file name
+
+    path = Path(settings.CONFIG_PATH)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
+    try:
+        with open(tmp, "w") as f:
+            json.dump(config, f, indent=2)
+        os.replace(tmp, path)
+    except BaseException:
+        # Do not leave a stray temporary file behind when the write fails.
+        tmp.unlink(missing_ok=True)
+        raise
+
+
 def _apply_env_overrides(config: dict) -> dict:
    cfg = deepcopy(config)
    docs = cfg.setdefault("documents", {})
@@ -84,3 +112,22 @@ async def load_doc_config() -> dict:
    _cache = data
    _cache_at = now
    return data
+
+
+async def get_storage_config() -> dict:
+    """Return the effective storage settings.
+
+    Starts from a copy of the built-in storage defaults and overlays whatever
+    the loaded document config holds under its "storage" key, so every default
+    key is present in the result.
+    """
+    loaded = await load_doc_config()
+    overrides = loaded.get("storage", {})
+    effective = deepcopy(_DEFAULT_STORAGE_CONFIG)
+    effective.update(overrides)
+    return effective
+
+
+async def save_storage_config(data: dict) -> None:
+    """Merge data into the storage config block and persist to disk.
+
+    Only the keys present in ``data`` are changed; other stored storage keys
+    and all other config sections are written back as read.  When no config
+    file exists yet, the built-in defaults are used as the base, so the whole
+    default config ends up on disk.
+    """
+    global _cache, _cache_at
+    # Read the raw file (no env overrides) so overrides are not persisted.
+    raw = await asyncio.to_thread(_read_config_sync_raw)
+    raw.setdefault("storage", {}).update(data)
+    # NOTE(review): this read-modify-write is not serialized; two overlapping
+    # calls could both read the old file and the later write would drop the
+    # earlier update — confirm whether callers can overlap.
+    await asyncio.to_thread(_write_config_sync, raw)
+    # Invalidate cache so next read picks up the new values
+    _cache = None
+    _cache_at = 0.0
@@ -0,0 +1,256 @@
+"""
+File-system watcher for the watch directory.
+
+Uses the watchdog library to monitor a configured directory for new PDF files.
+When a PDF is detected, it is automatically ingested into the document service
+(copied to /data/documents, a DB record is created, and the AI pipeline runs).
+
+Key design decisions:
+- No-remove policy: on_deleted and on_modified events are intentionally ignored
+  (on_moved only triggers ingestion of the destination path).  The watcher
+  never deletes, moves, or modifies files on the watched volume.
+- Watch documents use user_id="watch" as a sentinel so they are visible to
+  all authenticated users in the document list.
+- Subfolder names map to categories: a file at invoices/bill.pdf is assigned
+  to an "invoices" category (auto-created if needed).
+- Suggestions: if ai_folder_suggestion or ai_rename_suggestion are enabled,
+  the relevant fields are set on the document after AI processing so users
+  can confirm/reject from the UI.
+- Thread → async bridge: watchdog runs in a daemon thread; asyncio coroutines
+  are dispatched from that thread via run_coroutine_threadsafe.
+"""
+import asyncio
+import json
+import logging
+import uuid
+from pathlib import Path
+
+from watchdog.events import FileSystemEventHandler
+from watchdog.observers import Observer
+
+from app.database import AsyncSessionLocal
+from app.models.category import DocumentCategory
+from app.models.category_assignment import CategoryAssignment
+from app.models.document import Document
+from app.services.storage import save_upload
+
+logger = logging.getLogger(__name__)
+
+# Must match _WATCH_USER_ID in app/routers/documents.py
+WATCH_USER_ID = "watch"
+
+
+# ── Ingestion logic ───────────────────────────────────────────────────────────
+
+
+async def ingest_file(path_str: str, watch_root: Path, config: dict) -> None:
+    """
+    Ingest a single PDF file from the watch directory.
+
+    Copies the file into document storage under the watch sentinel user,
+    creates its ``Document`` row (or resets the row of a previously failed
+    attempt), assigns a category named after the first subfolder below
+    ``watch_root``, runs the AI pipeline and, when enabled in ``config``,
+    stores AI suggestions for later confirmation.
+
+    Idempotent: skips files that already have a non-failed document record.
+    The source file is only read, never modified.
+
+    Args:
+        path_str: Path of the candidate file as reported by the watcher or scan.
+        watch_root: Root of the watched directory; used to derive the category.
+        config: Storage config; ``ai_folder_suggestion`` and
+            ``ai_rename_suggestion`` are read here.
+    """
+    from sqlalchemy import select
+
+    path = Path(path_str)
+    if not path.exists() or not path.is_file():
+        return
+
+    async with AsyncSessionLocal() as db:
+        # Idempotency check — skip if already tracked (and not failed)
+        # NOTE(review): two overlapping calls for the same path can both pass
+        # this check before either commits — confirm whether that can happen.
+        existing_result = await db.execute(
+            select(Document).where(Document.watch_path == path_str)
+        )
+        existing = existing_result.scalar_one_or_none()
+        if existing is not None and existing.status != "failed":
+            return
+
+        # Determine category from the first subfolder component
+        try:
+            rel = path.relative_to(watch_root)
+            folder_name = rel.parts[0] if len(rel.parts) > 1 else None
+        except ValueError:
+            folder_name = None
+
+        # Read file bytes in a worker thread so a large file or a slow mount
+        # does not stall the event loop.
+        try:
+            file_data = await asyncio.to_thread(path.read_bytes)
+        except OSError as exc:
+            logger.warning("[watcher] Cannot read %s: %s", path_str, exc)
+            return
+
+        # Save a copy to /data/documents/watch/{doc_id}.pdf
+        doc_id = existing.id if existing is not None else str(uuid.uuid4())
+        dest = await save_upload(file_data, WATCH_USER_ID, doc_id)
+
+        if existing is not None:
+            # Re-ingest a previously failed document
+            existing.file_path = str(dest)
+            existing.file_size = len(file_data)
+            existing.status = "pending"
+            existing.error_message = None
+            await db.commit()
+        else:
+            doc = Document(
+                id=doc_id,
+                user_id=WATCH_USER_ID,
+                source="watch",
+                watch_path=path_str,
+                filename=path.name,
+                file_path=str(dest),
+                file_size=len(file_data),
+                status="pending",
+            )
+            db.add(doc)
+            await db.commit()
+
+        # Auto-assign category from subfolder name
+        if folder_name:
+            # Truncate once and use the same value for lookup and creation.
+            # Looking up the full name while storing the truncated one would
+            # never match for names longer than 128 characters and would
+            # create a duplicate category on every ingest.
+            category_name = folder_name[:128]
+            cat_result = await db.execute(
+                select(DocumentCategory).where(
+                    DocumentCategory.user_id == WATCH_USER_ID,
+                    DocumentCategory.name == category_name,
+                )
+            )
+            cat = cat_result.scalar_one_or_none()
+            if cat is None:
+                cat = DocumentCategory(user_id=WATCH_USER_ID, name=category_name)
+                db.add(cat)
+                await db.commit()
+                await db.refresh(cat)
+
+            exists_assign = await db.execute(
+                select(CategoryAssignment).where(
+                    CategoryAssignment.document_id == doc_id,
+                    CategoryAssignment.category_id == cat.id,
+                )
+            )
+            if exists_assign.scalar_one_or_none() is None:
+                db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))
+                await db.commit()
+
+    # Run AI pipeline (opens its own session internally)
+    from app.routers.documents import process_document
+    await process_document(doc_id)
+
+    # Set AI suggestions if enabled
+    if config.get("ai_folder_suggestion") or config.get("ai_rename_suggestion"):
+        await _apply_suggestions(doc_id, config)
+
+
+async def _apply_suggestions(doc_id: str, config: dict) -> None:
+    """Populate suggested_folder / suggested_filename after AI processing.
+
+    Does nothing unless the document exists, has status ``"done"`` and carries
+    ``extracted_data`` that parses to a JSON object.  The suggestions are only
+    stored on the document; applying them is left to the confirm endpoints.
+
+    Args:
+        doc_id: ID of the document that was just processed.
+        config: Storage config; ``ai_folder_suggestion`` and
+            ``ai_rename_suggestion`` select which suggestions are stored.
+    """
+    from sqlalchemy import select
+
+    async with AsyncSessionLocal() as db:
+        result = await db.execute(select(Document).where(Document.id == doc_id))
+        doc = result.scalar_one_or_none()
+        if doc is None or doc.status != "done" or not doc.extracted_data:
+            return
+
+        try:
+            extracted = json.loads(doc.extracted_data)
+        except (TypeError, ValueError):
+            # Not a string/bytes value or not valid JSON: nothing to suggest.
+            return
+        # The payload is model output; anything but a JSON object has no keys
+        # to read and would otherwise fail on ``.get``.
+        if not isinstance(extracted, dict):
+            return
+
+        changed = False
+        if config.get("ai_folder_suggestion"):
+            suggestions = extracted.get("suggested_categories")
+            # Accept a single category given as a bare string; indexing the
+            # string directly would yield only its first character.
+            if isinstance(suggestions, str):
+                suggestions = [suggestions]
+            if isinstance(suggestions, list) and suggestions and suggestions[0]:
+                doc.suggested_folder = str(suggestions[0])[:128]
+                changed = True
+
+        if config.get("ai_rename_suggestion"):
+            title = extracted.get("title")
+            if title:
+                doc.suggested_filename = str(title)[:500]
+                changed = True
+
+        if changed:
+            await db.commit()
+
+
+# ── Watchdog event handler ────────────────────────────────────────────────────
+
+
+class _PdfEventHandler(FileSystemEventHandler):
+    """Watchdog handler that schedules ingestion of new or moved-in PDF files.
+
+    Watchdog invokes the ``on_*`` callbacks from its own thread, so ingestion
+    coroutines are handed to the application's event loop with
+    ``asyncio.run_coroutine_threadsafe``.
+    """
+
+    def __init__(
+        self,
+        watch_root: Path,
+        loop: asyncio.AbstractEventLoop,
+        config: dict,
+    ) -> None:
+        super().__init__()
+        self._watch_root = watch_root
+        self._loop = loop
+        self._config = config
+
+    def _dispatch_ingest(self, path_str: str) -> None:
+        """Schedule ``ingest_file`` for ``path_str`` if it names a ``.pdf`` file."""
+        if not path_str.lower().endswith(".pdf"):
+            return
+        future = asyncio.run_coroutine_threadsafe(
+            ingest_file(path_str, self._watch_root, self._config),
+            self._loop,
+        )
+        # Nobody waits on this future, so without a callback any exception
+        # raised during ingestion would be dropped without a trace.
+        future.add_done_callback(
+            lambda fut: self._log_ingest_failure(path_str, fut)
+        )
+
+    @staticmethod
+    def _log_ingest_failure(path_str: str, future) -> None:
+        """Log the exception of a finished ingestion future, if it has one."""
+        if future.cancelled():
+            # ``exception()`` would raise on a cancelled future.
+            return
+        exc = future.exception()
+        if exc is not None:
+            logger.warning("[watcher] ingest failed for %s: %s", path_str, exc)
+
+    def on_created(self, event):  # type: ignore[override]
+        if not event.is_directory:
+            self._dispatch_ingest(event.src_path)
+
+    def on_moved(self, event):  # type: ignore[override]
+        # Handles atomic rename/move (e.g. Nextcloud or Syncthing completing a sync)
+        if not event.is_directory:
+            self._dispatch_ingest(event.dest_path)
+
+    # on_deleted / on_modified: intentionally not overridden — no-remove policy
+
+
+# ── Service ───────────────────────────────────────────────────────────────────
+
+
+class FileWatcherService:
+    """Manages the watchdog Observer lifecycle within the FastAPI lifespan.
+
+    ``start`` schedules a recursive observer on the watch path and launches a
+    background scan that ingests PDFs already present; ``stop`` cancels that
+    scan if it is still running and shuts the observer down.
+    """
+
+    def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
+        self._loop = loop
+        self._observer: Observer | None = None
+        self._watch_root: Path | None = None
+        self._config: dict = {}
+        # Strong reference to the startup-scan task: the event loop only keeps
+        # weak references to tasks, so an unreferenced task may be garbage
+        # collected before it finishes.
+        self._scan_task: asyncio.Task | None = None
+
+    async def start(self, watch_path: str, config: dict) -> None:
+        """Start watching ``watch_path``; log and do nothing if it is missing."""
+        self._watch_root = Path(watch_path)
+        self._config = config
+
+        if not self._watch_root.exists():
+            logger.warning(
+                "[watcher] Watch path %s does not exist — file watching disabled",
+                watch_path,
+            )
+            return
+
+        handler = _PdfEventHandler(self._watch_root, self._loop, config)
+        self._observer = Observer()
+        self._observer.schedule(handler, watch_path, recursive=True)
+        self._observer.start()
+        logger.info("[watcher] started, watching %s", watch_path)
+
+        # Run startup scan as a background task so startup is not blocked
+        self._scan_task = asyncio.create_task(self._scan_existing())
+
+    @staticmethod
+    def _list_pdfs(root: Path) -> list[Path]:
+        """Return all paths below ``root`` whose name ends in ``.pdf``, sorted.
+
+        The match ignores case so the startup scan accepts the same file names
+        as the live event handler (``report.PDF`` as well as ``report.pdf``).
+        """
+        return sorted(
+            p for p in root.rglob("*") if p.name.lower().endswith(".pdf")
+        )
+
+    async def _scan_existing(self) -> None:
+        """Ingest any PDFs already present in the watch directory."""
+        if self._watch_root is None:
+            return
+        logger.info("[watcher] scanning existing files in %s", self._watch_root)
+        try:
+            # Walk the tree in a worker thread; a large directory would
+            # otherwise block the event loop for the duration of the walk.
+            pdf_paths = await asyncio.to_thread(self._list_pdfs, self._watch_root)
+        except OSError as exc:
+            logger.warning("[watcher] cannot scan %s: %s", self._watch_root, exc)
+            return
+        count = 0
+        for pdf_path in pdf_paths:
+            try:
+                await ingest_file(str(pdf_path), self._watch_root, self._config)
+                count += 1
+            except Exception as exc:
+                logger.warning("[watcher] scan error for %s: %s", pdf_path, exc)
+        logger.info("[watcher] startup scan complete — processed %d file(s)", count)
+
+    async def stop(self) -> None:
+        """Cancel a running startup scan and stop the observer thread."""
+        if self._scan_task is not None:
+            self._scan_task.cancel()
+            # return_exceptions=True collects the task's CancelledError (or any
+            # other outcome) instead of raising it here.
+            await asyncio.gather(self._scan_task, return_exceptions=True)
+            self._scan_task = None
+        if self._observer is not None:
+            self._observer.stop()
+            # join() blocks until the observer thread exits, so run it off-loop.
+            await asyncio.to_thread(self._observer.join)
+            self._observer = None
+            logger.info("[watcher] stopped")