Add generic plugin architecture and watch-directory feature
Introduces a manifest contract so feature containers self-describe their settings (JSON Schema + access rules). Backend and frontend gain generic plugin proxy and dynamic Extensions UI with zero feature-specific code. Doc-service is the first plugin consumer: exposes /plugin/manifest and /plugin/settings, adds a watchdog-based file watcher that auto-ingests PDFs from a mounted directory, maps subfolders to categories, supports AI-suggested folder/filename (user-confirmed), and enforces a no-remove policy. Access is gated by is_superuser or doc-service-admin group. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,15 +1,45 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
from app.core.config import settings
|
||||
from app.routers import categories, documents
|
||||
from app.routers import plugin as plugin_router
|
||||
|
||||
app = FastAPI(title=settings.PROJECT_NAME)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the optional watch-directory file watcher for the app's lifetime.

    On startup the storage config is read; when ``watch_enabled`` is set, a
    ``FileWatcherService`` is created and started on ``watch_path``. Any
    exception raised while doing so is logged as a warning and the
    application starts without a (working) watcher. On shutdown the watcher,
    if one was created, is stopped.
    """
    loop = asyncio.get_running_loop()
    watcher = None

    try:
        # The imports live inside the try block, so an import failure is
        # logged below just like any other startup error.
        from app.services.config_reader import get_storage_config

        storage_config = await get_storage_config()
        if storage_config.get("watch_enabled"):
            from app.services.file_watcher import FileWatcherService

            watcher = FileWatcherService(loop)
            await watcher.start(storage_config["watch_path"], storage_config)
    except Exception as exc:
        logger.warning("[doc-service] File watcher could not start: %s", exc)

    try:
        yield
    finally:
        # Runs even when an exception is thrown into the generator at the
        # yield point, so a started watcher is always stopped.
        if watcher is not None:
            await watcher.stop()
|
||||
|
||||
|
||||
app = FastAPI(title=settings.PROJECT_NAME, lifespan=lifespan)
|
||||
|
||||
# No CORS — this service is only reachable from the main backend on backend-net.
|
||||
# All browser traffic goes through the main backend proxy.
|
||||
|
||||
app.include_router(documents.router, prefix="/documents", tags=["documents"])
|
||||
app.include_router(categories.router, prefix="/categories", tags=["categories"])
|
||||
app.include_router(plugin_router.router, prefix="/plugin", tags=["plugin"])
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
|
||||
@@ -27,6 +27,12 @@ class Document(Base):
|
||||
)
|
||||
processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
|
||||
|
||||
# Watch-directory ingestion fields (migration 0003)
|
||||
source: Mapped[str] = mapped_column(String(16), nullable=False, default="upload")
|
||||
watch_path: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
suggested_folder: Mapped[str | None] = mapped_column(String(128), nullable=True)
|
||||
suggested_filename: Mapped[str | None] = mapped_column(String(500), nullable=True)
|
||||
|
||||
category_assignments: Mapped[list["CategoryAssignment"]] = relationship(
|
||||
"CategoryAssignment", back_populates="document", cascade="all, delete-orphan"
|
||||
)
|
||||
|
||||
@@ -5,6 +5,8 @@ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from sqlalchemy import or_
|
||||
|
||||
from app.database import AsyncSessionLocal, get_db
|
||||
from app.deps import get_user_id
|
||||
from app.models.category import DocumentCategory
|
||||
@@ -15,6 +17,9 @@ from app.services.ai_client import classify_document
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Sentinel user_id for watch-ingested categories — must match documents.py
|
||||
_WATCH_USER_ID = "watch"
|
||||
|
||||
_SIMILARITY_THRESHOLD = 0.4
|
||||
|
||||
|
||||
@@ -81,9 +86,10 @@ async def list_categories(
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> list[DocumentCategory]:
|
||||
# Include watch-ingested categories so they appear in the sidebar/filter
|
||||
result = await db.execute(
|
||||
select(DocumentCategory)
|
||||
.where(DocumentCategory.user_id == user_id)
|
||||
.where(or_(DocumentCategory.user_id == user_id, DocumentCategory.user_id == _WATCH_USER_ID))
|
||||
.order_by(DocumentCategory.name)
|
||||
)
|
||||
return result.scalars().all()
|
||||
|
||||
@@ -26,13 +26,21 @@ router = APIRouter()
|
||||
|
||||
_DEFAULT_MAX_BYTES = 20 * 1024 * 1024
|
||||
|
||||
# Sentinel user_id used for watch-directory-ingested documents.
|
||||
# These documents are visible to all authenticated users.
|
||||
_WATCH_USER_ID = "watch"
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
|
||||
"""Fetch a document owned by user_id OR a watch-ingested document (visible to all)."""
|
||||
result = await db.execute(
|
||||
select(Document)
|
||||
.where(Document.id == doc_id, Document.user_id == user_id)
|
||||
.where(
|
||||
Document.id == doc_id,
|
||||
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
|
||||
)
|
||||
.options(
|
||||
selectinload(Document.category_assignments)
|
||||
.selectinload(CategoryAssignment.category)
|
||||
@@ -61,6 +69,10 @@ def _doc_with_categories(doc: Document) -> DocumentOut:
|
||||
created_at=doc.created_at,
|
||||
processed_at=doc.processed_at,
|
||||
categories=cats,
|
||||
source=doc.source,
|
||||
watch_path=doc.watch_path,
|
||||
suggested_folder=doc.suggested_folder,
|
||||
suggested_filename=doc.suggested_filename,
|
||||
)
|
||||
|
||||
|
||||
@@ -183,7 +195,8 @@ async def list_documents(
|
||||
sort_expr = sort_col.desc() if order == "desc" else sort_col.asc()
|
||||
|
||||
# Build filter conditions once and reuse for both count + items queries.
|
||||
conditions = [Document.user_id == user_id]
|
||||
# Watch-ingested documents (user_id = "watch") are visible to all users.
|
||||
conditions = [or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID)]
|
||||
if status:
|
||||
conditions.append(Document.status == status)
|
||||
if document_type:
|
||||
@@ -247,7 +260,10 @@ async def get_document_status(
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> Document:
|
||||
result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
select(Document).where(
|
||||
Document.id == doc_id,
|
||||
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
|
||||
)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if doc is None:
|
||||
@@ -347,7 +363,10 @@ async def download_file(
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> StreamingResponse:
|
||||
result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
select(Document).where(
|
||||
Document.id == doc_id,
|
||||
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
|
||||
)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if doc is None:
|
||||
@@ -374,9 +393,12 @@ async def assign_category(
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> None:
|
||||
# Verify both belong to this user
|
||||
# Verify the document is accessible (own or watch-ingested)
|
||||
doc_result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
select(Document).where(
|
||||
Document.id == doc_id,
|
||||
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
|
||||
)
|
||||
)
|
||||
if doc_result.scalar_one_or_none() is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
@@ -418,3 +440,81 @@ async def remove_category(
|
||||
if assignment:
|
||||
await db.delete(assignment)
|
||||
await db.commit()
|
||||
|
||||
|
||||
# ── AI suggestion confirmation ────────────────────────────────────────────────
|
||||
# These endpoints allow users to confirm or reject AI suggestions on
|
||||
# watch-ingested documents. No disk mutations — suggestions only update the DB.
|
||||
|
||||
@router.post("/{doc_id}/suggestions/folder/confirm", status_code=204)
async def confirm_folder_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Apply the pending AI folder suggestion as a category assignment.

    The suggested name is looked up among the categories owned by the watch
    sentinel user and created there when missing. The document is then
    assigned to that category (unless it already is) and the suggestion is
    cleared. Only the database is modified.

    Raises:
        HTTPException: 400 when the document has no pending folder suggestion.
    """
    doc = await _get_user_doc(doc_id, user_id, db)
    if not doc.suggested_folder:
        raise HTTPException(status_code=400, detail="No folder suggestion pending")

    # Truncate once so the lookup and the insert agree on the name; looking up
    # the untruncated value could never match a previously truncated category.
    category_name = doc.suggested_folder[:128]

    # Find or create the suggested category under the watch sentinel user
    cat_result = await db.execute(
        select(DocumentCategory).where(
            DocumentCategory.user_id == _WATCH_USER_ID,
            DocumentCategory.name == category_name,
        )
    )
    cat = cat_result.scalar_one_or_none()
    if cat is None:
        cat = DocumentCategory(user_id=_WATCH_USER_ID, name=category_name)
        db.add(cat)
        await db.commit()
        await db.refresh(cat)

    # Assign if not already assigned
    exists = await db.execute(
        select(CategoryAssignment).where(
            CategoryAssignment.document_id == doc_id,
            CategoryAssignment.category_id == cat.id,
        )
    )
    if exists.scalar_one_or_none() is None:
        db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))

    doc.suggested_folder = None
    await db.commit()
|
||||
|
||||
|
||||
@router.post("/{doc_id}/suggestions/folder/reject", status_code=204)
async def reject_folder_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Discard the pending AI folder suggestion without applying it."""
    document = await _get_user_doc(doc_id, user_id, db)
    # Clearing the field is the whole rejection; no category is touched.
    document.suggested_folder = None
    await db.commit()
|
||||
|
||||
|
||||
@router.post("/{doc_id}/suggestions/filename/confirm", status_code=204)
async def confirm_filename_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Promote the pending AI filename suggestion to the document title.

    Raises:
        HTTPException: 400 when the document has no pending filename
            suggestion.
    """
    document = await _get_user_doc(doc_id, user_id, db)
    proposed_title = document.suggested_filename
    if not proposed_title:
        raise HTTPException(status_code=400, detail="No filename suggestion pending")

    # Only the DB title changes; the suggestion is consumed in the same commit.
    document.title = proposed_title
    document.suggested_filename = None
    await db.commit()
|
||||
|
||||
|
||||
@router.post("/{doc_id}/suggestions/filename/reject", status_code=204)
async def reject_filename_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Discard the pending AI filename suggestion; the title is left as is."""
    document = await _get_user_doc(doc_id, user_id, db)
    document.suggested_filename = None
    await db.commit()
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
"""
|
||||
Plugin manifest and settings endpoints for doc-service.
|
||||
|
||||
These are internal-only — they are called by the main backend's generic plugin
|
||||
proxy, never directly by the browser. No authentication is applied here because
|
||||
the backend enforces access control before forwarding the request.
|
||||
|
||||
Endpoints:
|
||||
GET /plugin/manifest → static manifest with JSON Schema for settings
|
||||
GET /plugin/settings → current storage config values
|
||||
PATCH /plugin/settings → update storage config (partial update)
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.services.config_reader import get_storage_config, save_storage_config
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Static plugin manifest served by GET /plugin/manifest.
#   - "access": who may use the plugin (superusers, or members of the listed
#     groups).
#   - "settings_schema": JSON Schema for the storage/watch settings. Its
#     properties mirror the fields of StorageSettingsUpdate, plus the
#     read-only "watch_path".
_MANIFEST: dict = {
    "id": "doc-service",
    "name": "Document Service",
    "icon": "file-text",
    "version": "1.0",
    "access": {
        "allow_superuser": True,
        "required_groups": ["doc-service-admin"],
    },
    "settings_schema": {
        "type": "object",
        "title": "Storage & Watch",
        "properties": {
            "watch_enabled": {
                "type": "boolean",
                "title": "Enable file watching",
                "description": (
                    "Automatically ingest PDF files added to the mounted watch directory. "
                    "Requires a service restart to take effect after toggling."
                ),
            },
            # Shown for information only; not accepted by PATCH /plugin/settings.
            "watch_path": {
                "type": "string",
                "title": "Watch path",
                "readOnly": True,
                "description": "Configured via Docker volume mount — edit docker-compose to change.",
            },
            "ai_folder_suggestion": {
                "type": "boolean",
                "title": "AI folder suggestion",
                "description": (
                    "AI suggests a category for each ingested document. "
                    "You must confirm the suggestion before it is applied."
                ),
            },
            "ai_folder_default": {
                "type": "string",
                "title": "Default import category",
                "description": "Category assigned automatically when AI folder suggestion is disabled.",
            },
            "ai_rename_suggestion": {
                "type": "boolean",
                "title": "AI rename suggestion",
                "description": (
                    "AI suggests a document title for each ingested file. "
                    "You must confirm before it is applied."
                ),
            },
        },
    },
}
|
||||
|
||||
|
||||
# Request body for PATCH /plugin/settings. Every field is optional and
# defaults to None; update_settings drops None values before saving, so only
# the fields actually supplied are changed.
class StorageSettingsUpdate(BaseModel):
    # Toggle for the watch-directory file watcher.
    watch_enabled: bool | None = None
    # Toggle for AI category suggestions on ingested documents.
    ai_folder_suggestion: bool | None = None
    # Category name used when AI folder suggestion is disabled.
    ai_folder_default: str | None = None
    # Toggle for AI title suggestions on ingested documents.
    ai_rename_suggestion: bool | None = None
    # watch_path is intentionally excluded — it cannot be changed via API
|
||||
|
||||
|
||||
@router.get("/manifest")
async def get_manifest() -> dict:
    """Return the static plugin manifest, including the settings JSON Schema."""
    manifest = _MANIFEST
    return manifest
|
||||
|
||||
|
||||
@router.get("/settings")
async def get_settings() -> dict:
    """Return the current storage config values."""
    current = await get_storage_config()
    return current
|
||||
|
||||
|
||||
@router.patch("/settings")
|
||||
async def update_settings(body: StorageSettingsUpdate) -> dict:
|
||||
update = body.model_dump(exclude_none=True)
|
||||
if "ai_folder_default" in update:
|
||||
update["ai_folder_default"] = update["ai_folder_default"][:128].strip() or "imports"
|
||||
await save_storage_config(update)
|
||||
return await get_storage_config()
|
||||
@@ -23,6 +23,10 @@ class DocumentOut(BaseModel):
|
||||
created_at: datetime
|
||||
processed_at: datetime | None
|
||||
categories: list[CategoryOut] = []
|
||||
source: str = "upload"
|
||||
watch_path: str | None = None
|
||||
suggested_folder: str | None = None
|
||||
suggested_filename: str | None = None
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
@@ -14,6 +14,14 @@ from pathlib import Path
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
# Defaults for the "storage" config block. get_storage_config() starts from a
# copy of this dict and overlays whatever the loaded config provides, so any
# key missing from the loaded config takes the value given here.
_DEFAULT_STORAGE_CONFIG: dict = {
    "watch_enabled": False,
    "watch_path": "/data/watch",
    "ai_folder_suggestion": False,
    "ai_folder_default": "imports",
    "ai_rename_suggestion": False,
}
|
||||
|
||||
_DEFAULT_SYSTEM_PROMPT = (
|
||||
"You are a financial document analysis assistant. "
|
||||
"Given the text extracted from a PDF document, return ONLY a JSON object "
|
||||
@@ -43,6 +51,7 @@ _DEFAULT_USER_TEMPLATE = (
|
||||
|
||||
_DEFAULT_CONFIG: dict = {
|
||||
"documents": {"max_pdf_bytes": 20 * 1024 * 1024},
|
||||
"storage": _DEFAULT_STORAGE_CONFIG,
|
||||
"system_prompts": {
|
||||
"system": _DEFAULT_SYSTEM_PROMPT,
|
||||
"user_template": _DEFAULT_USER_TEMPLATE,
|
||||
@@ -64,6 +73,25 @@ def _read_config_sync() -> dict:
|
||||
return _apply_env_overrides(base)
|
||||
|
||||
|
||||
def _read_config_sync_raw() -> dict:
    """Load the config file as stored on disk, without env overrides.

    Used when the config is about to be written back to disk. Returns a copy
    of the built-in defaults when the config file does not exist.
    """
    config_file = Path(settings.CONFIG_PATH)
    if config_file.exists():
        with open(config_file) as handle:
            return json.load(handle)
    return deepcopy(_DEFAULT_CONFIG)
|
||||
|
||||
|
||||
def _write_config_sync(config: dict) -> None:
    """Atomically write config JSON to disk.

    The JSON is first written to a sibling file carrying a ``.tmp`` suffix,
    which then replaces the real config file in a single ``os.replace`` call.
    """
    target = Path(settings.CONFIG_PATH)
    scratch = target.with_suffix(".tmp")

    # Make sure the destination directory exists before writing into it.
    scratch.parent.mkdir(parents=True, exist_ok=True)

    with open(scratch, "w") as handle:
        json.dump(config, handle, indent=2)

    os.replace(scratch, target)
|
||||
|
||||
|
||||
def _apply_env_overrides(config: dict) -> dict:
|
||||
cfg = deepcopy(config)
|
||||
docs = cfg.setdefault("documents", {})
|
||||
@@ -84,3 +112,22 @@ async def load_doc_config() -> dict:
|
||||
_cache = data
|
||||
_cache_at = now
|
||||
return data
|
||||
|
||||
|
||||
async def get_storage_config() -> dict:
    """Return the ``storage`` config block with defaults filled in.

    Values found under ``storage`` in the loaded config take precedence; any
    key missing there keeps its value from ``_DEFAULT_STORAGE_CONFIG``.
    """
    loaded = await load_doc_config()
    overrides = loaded.get("storage", {})

    # Start from a private copy so the module-level defaults are never mutated.
    merged = deepcopy(_DEFAULT_STORAGE_CONFIG)
    merged.update(overrides)
    return merged
|
||||
|
||||
|
||||
async def save_storage_config(data: dict) -> None:
    """Merge *data* into the ``storage`` config block and persist it to disk.

    The file is read without env overrides, updated and rewritten in worker
    threads; afterwards the module-level config cache is reset.
    """
    global _cache, _cache_at

    on_disk = await asyncio.to_thread(_read_config_sync_raw)
    storage_block = on_disk.setdefault("storage", {})
    storage_block.update(data)
    await asyncio.to_thread(_write_config_sync, on_disk)

    # Invalidate the cache so the next read picks up the new values.
    _cache, _cache_at = None, 0.0
|
||||
|
||||
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
File-system watcher for the watch directory.
|
||||
|
||||
Uses the watchdog library to monitor a configured directory for new PDF files.
|
||||
When a PDF is detected, it is automatically ingested into the document service
|
||||
(copied to /data/documents, a DB record is created, and the AI pipeline runs).
|
||||
|
||||
Key design decisions:
|
||||
- No-remove policy: on_deleted and on_modified events are intentionally ignored; on_moved only triggers ingestion of the destination path.
|
||||
The watcher never deletes, moves, or modifies files on the watched volume.
|
||||
- Watch documents use user_id="watch" as a sentinel so they are visible to
|
||||
all authenticated users in the document list.
|
||||
- Subfolder names map to categories: a file at invoices/bill.pdf is assigned
|
||||
to a "invoices" category (auto-created if needed).
|
||||
- Suggestions: if ai_folder_suggestion or ai_rename_suggestion are enabled,
|
||||
the relevant fields are set on the document after AI processing so users
|
||||
can confirm/reject from the UI.
|
||||
- Thread → async bridge: watchdog runs in a daemon thread; asyncio coroutines
|
||||
are dispatched from that thread via run_coroutine_threadsafe.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from watchdog.observers import Observer
|
||||
|
||||
from app.database import AsyncSessionLocal
|
||||
from app.models.category import DocumentCategory
|
||||
from app.models.category_assignment import CategoryAssignment
|
||||
from app.models.document import Document
|
||||
from app.services.storage import save_upload
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Must match _WATCH_USER_ID in app/routers/documents.py
|
||||
WATCH_USER_ID = "watch"
|
||||
|
||||
|
||||
# ── Ingestion logic ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Paths whose ingestion is currently in flight. The set is checked and updated
# without an intervening await, so overlapping triggers for the same file (for
# example the startup scan and a file-system event) cannot both proceed.
_INGEST_IN_PROGRESS: set[str] = set()


async def ingest_file(path_str: str, watch_root: Path, config: dict) -> None:
    """
    Ingest a single PDF file from the watch directory.

    Idempotent: skips files that already have a non-failed document record
    and files whose ingestion is already in progress. A previously failed
    record for the same path is reused and reset to "pending".

    Args:
        path_str: Path of the candidate file.
        watch_root: Root of the watched tree; when the file sits in a
            subfolder, the first path component below the root becomes the
            document's category.
        config: Storage config; ``ai_folder_suggestion`` and
            ``ai_rename_suggestion`` decide whether suggestions are stored
            after processing.
    """
    if path_str in _INGEST_IN_PROGRESS:
        return
    _INGEST_IN_PROGRESS.add(path_str)
    try:
        await _ingest_file_once(path_str, watch_root, config)
    finally:
        _INGEST_IN_PROGRESS.discard(path_str)


async def _ingest_file_once(path_str: str, watch_root: Path, config: dict) -> None:
    """Copy one file into storage, record and categorise it, then process it."""
    from sqlalchemy import select

    path = Path(path_str)
    if not path.exists() or not path.is_file():
        return

    async with AsyncSessionLocal() as db:
        # Idempotency check — skip if already tracked (and not failed)
        existing_result = await db.execute(
            select(Document).where(Document.watch_path == path_str)
        )
        existing = existing_result.scalar_one_or_none()
        if existing is not None and existing.status != "failed":
            return

        # Determine category from the first subfolder component
        try:
            rel = path.relative_to(watch_root)
            folder_name = rel.parts[0] if len(rel.parts) > 1 else None
        except ValueError:
            folder_name = None
        if folder_name:
            # Truncate once so lookup and creation use the same name; looking
            # up the untruncated value would miss a previously truncated
            # category and create a duplicate on every ingest.
            folder_name = folder_name[:128]

        # Read file bytes in a worker thread so a large PDF does not block
        # the event loop.
        try:
            file_data = await asyncio.to_thread(path.read_bytes)
        except OSError as exc:
            logger.warning("[watcher] Cannot read %s: %s", path_str, exc)
            return

        # Save a copy to /data/documents/watch/{doc_id}.pdf
        doc_id = existing.id if existing is not None else str(uuid.uuid4())
        dest = await save_upload(file_data, WATCH_USER_ID, doc_id)

        if existing is not None:
            # Re-ingest a previously failed document
            existing.file_path = str(dest)
            existing.file_size = len(file_data)
            existing.status = "pending"
            existing.error_message = None
            await db.commit()
        else:
            doc = Document(
                id=doc_id,
                user_id=WATCH_USER_ID,
                source="watch",
                watch_path=path_str,
                filename=path.name,
                file_path=str(dest),
                file_size=len(file_data),
                status="pending",
            )
            db.add(doc)
            await db.commit()

        # Auto-assign category from subfolder name
        if folder_name:
            cat_result = await db.execute(
                select(DocumentCategory).where(
                    DocumentCategory.user_id == WATCH_USER_ID,
                    DocumentCategory.name == folder_name,
                )
            )
            cat = cat_result.scalar_one_or_none()
            if cat is None:
                cat = DocumentCategory(user_id=WATCH_USER_ID, name=folder_name)
                db.add(cat)
                await db.commit()
                await db.refresh(cat)

            exists_assign = await db.execute(
                select(CategoryAssignment).where(
                    CategoryAssignment.document_id == doc_id,
                    CategoryAssignment.category_id == cat.id,
                )
            )
            if exists_assign.scalar_one_or_none() is None:
                db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))
                await db.commit()

    # Run AI pipeline (opens its own session internally)
    from app.routers.documents import process_document

    await process_document(doc_id)

    # Set AI suggestions if enabled
    if config.get("ai_folder_suggestion") or config.get("ai_rename_suggestion"):
        await _apply_suggestions(doc_id, config)
|
||||
|
||||
|
||||
async def _apply_suggestions(doc_id: str, config: dict) -> None:
    """Populate suggested_folder / suggested_filename after AI processing.

    Nothing is written unless the document exists, has status "done" and its
    ``extracted_data`` decodes to a JSON object. The folder suggestion is the
    first entry of ``suggested_categories`` (cut to 128 characters); the
    filename suggestion is ``title`` (cut to 500 characters). Each is only
    set when the matching flag in *config* is enabled.
    """
    from sqlalchemy import select

    async with AsyncSessionLocal() as db:
        result = await db.execute(select(Document).where(Document.id == doc_id))
        doc = result.scalar_one_or_none()
        if doc is None or doc.status != "done" or not doc.extracted_data:
            return

        try:
            extracted = json.loads(doc.extracted_data)
        except (TypeError, ValueError):
            # Not a string / not valid JSON: there is nothing to suggest.
            return
        if not isinstance(extracted, dict):
            # A top-level list or scalar has no named fields to read.
            return

        changed = False
        if config.get("ai_folder_suggestion"):
            suggestions = extracted.get("suggested_categories")
            # Require a real sequence: indexing a string would yield its first
            # character and indexing a dict would raise KeyError.
            if isinstance(suggestions, (list, tuple)) and suggestions:
                doc.suggested_folder = str(suggestions[0])[:128]
                changed = True

        if config.get("ai_rename_suggestion"):
            title = extracted.get("title")
            if title:
                doc.suggested_filename = str(title)[:500]
                changed = True

        if changed:
            await db.commit()
|
||||
|
||||
|
||||
# ── Watchdog event handler ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _PdfEventHandler(FileSystemEventHandler):
    """Watchdog handler that schedules ingestion of new PDF files.

    Callbacks hand the ``ingest_file`` coroutine over to the asyncio loop
    passed to the constructor via ``run_coroutine_threadsafe``.
    """

    def __init__(
        self,
        watch_root: Path,
        loop: asyncio.AbstractEventLoop,
        config: dict,
    ) -> None:
        super().__init__()
        self._watch_root = watch_root
        self._loop = loop
        self._config = config

    def _dispatch_ingest(self, path_str: str) -> None:
        """Schedule ingestion of *path_str* if it has a ``.pdf`` extension."""
        if not path_str.lower().endswith(".pdf"):
            return
        future = asyncio.run_coroutine_threadsafe(
            ingest_file(path_str, self._watch_root, self._config),
            self._loop,
        )
        # Without a callback the future is dropped and any exception raised
        # by ingest_file would never be reported anywhere.
        future.add_done_callback(
            lambda done: self._log_ingest_failure(path_str, done)
        )

    @staticmethod
    def _log_ingest_failure(path_str: str, future) -> None:
        """Log the exception of a finished ingestion future, if it has one."""
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            logger.warning("[watcher] ingest error for %s: %s", path_str, exc)

    def on_created(self, event):  # type: ignore[override]
        if not event.is_directory:
            self._dispatch_ingest(event.src_path)

    def on_moved(self, event):  # type: ignore[override]
        # Handles atomic rename/move (e.g. Nextcloud or Syncthing completing a sync)
        if not event.is_directory:
            self._dispatch_ingest(event.dest_path)

    # on_deleted / on_modified: intentionally not overridden — no-remove policy
|
||||
|
||||
|
||||
# ── Service ───────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class FileWatcherService:
    """Manages the watchdog Observer lifecycle within the FastAPI lifespan."""

    def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
        self._loop = loop
        self._observer: Observer | None = None
        self._watch_root: Path | None = None
        self._config: dict = {}
        # Strong reference to the startup scan; the event loop itself only
        # keeps weak references to tasks.
        self._scan_task: asyncio.Task | None = None

    async def start(self, watch_path: str, config: dict) -> None:
        """Start watching *watch_path* recursively and kick off a startup scan.

        Logs a warning and returns without starting anything when the path
        does not exist.
        """
        self._watch_root = Path(watch_path)
        self._config = config

        if not self._watch_root.exists():
            logger.warning(
                "[watcher] Watch path %s does not exist — file watching disabled",
                watch_path,
            )
            return

        handler = _PdfEventHandler(self._watch_root, self._loop, config)
        self._observer = Observer()
        self._observer.schedule(handler, watch_path, recursive=True)
        self._observer.start()
        logger.info("[watcher] started, watching %s", watch_path)

        # Run startup scan as a background task so startup is not blocked
        self._scan_task = asyncio.create_task(self._scan_existing())

    @staticmethod
    def _iter_pdf_files(root: Path) -> list[Path]:
        """Return all paths below *root* whose name ends in ``.pdf``, sorted.

        The match ignores case, the same rule the event handler applies to
        newly created files.
        """
        return sorted(
            candidate
            for candidate in root.rglob("*")
            if candidate.name.lower().endswith(".pdf")
        )

    async def _scan_existing(self) -> None:
        """Ingest any PDFs already present in the watch directory."""
        if self._watch_root is None:
            return
        logger.info("[watcher] scanning existing files in %s", self._watch_root)

        # Walking the tree is blocking file-system work; keep it off the loop.
        pdf_paths = await asyncio.to_thread(self._iter_pdf_files, self._watch_root)

        count = 0
        for pdf_path in pdf_paths:
            try:
                await ingest_file(str(pdf_path), self._watch_root, self._config)
                count += 1
            except Exception as exc:
                logger.warning("[watcher] scan error for %s: %s", pdf_path, exc)
        logger.info("[watcher] startup scan complete — processed %d file(s)", count)

    async def stop(self) -> None:
        """Cancel a running startup scan and shut the observer down."""
        if self._scan_task is not None:
            self._scan_task.cancel()
            # gather(..., return_exceptions=True) waits for the task to finish
            # without re-raising its CancelledError here.
            await asyncio.gather(self._scan_task, return_exceptions=True)
            self._scan_task = None

        if self._observer is not None:
            self._observer.stop()
            await asyncio.to_thread(self._observer.join)
            self._observer = None
            logger.info("[watcher] stopped")
|
||||
Reference in New Issue
Block a user