00466a9801
Introduces a manifest contract so feature containers self-describe their settings (JSON Schema + access rules). Backend and frontend gain generic plugin proxy and dynamic Extensions UI with zero feature-specific code. Doc-service is the first plugin consumer: exposes /plugin/manifest and /plugin/settings, adds a watchdog-based file watcher that auto-ingests PDFs from a mounted directory, maps subfolders to categories, supports AI-suggested folder/filename (user-confirmed), and enforces a no-remove policy. Access is gated by is_superuser or doc-service-admin group. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
521 lines
18 KiB
Python
521 lines
18 KiB
Python
import asyncio
|
|
import json
|
|
import math
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
|
|
import aiofiles
|
|
import pdfplumber
|
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, UploadFile
|
|
from fastapi.responses import StreamingResponse
|
|
from sqlalchemy import func, or_, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy.orm import selectinload
|
|
|
|
from app.database import AsyncSessionLocal, get_db
|
|
from app.deps import get_user_id
|
|
from app.models.category import DocumentCategory
|
|
from app.models.category_assignment import CategoryAssignment
|
|
from app.models.document import Document
|
|
from app.schemas.document import DocumentOut, DocumentPage, DocumentStatusOut, DocumentTypeUpdate, TagsUpdate, TitleUpdate
|
|
from app.services.ai_client import AIServiceError, classify_document
|
|
from app.services.config_reader import load_doc_config
|
|
from app.services.storage import delete_file, get_upload_path, save_upload
|
|
|
|
# Router on which every document endpoint declared in this module is registered.
router = APIRouter()
|
|
|
|
_DEFAULT_MAX_BYTES = 20 * 1024 * 1024
|
|
|
|
# Sentinel user_id used for watch-directory-ingested documents.
|
|
# These documents are visible to all authenticated users.
|
|
# NOTE: delete_document matches only the caller's own user_id, so documents
# stored under this sentinel cannot be deleted through that endpoint.
_WATCH_USER_ID = "watch"
|
|
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
    """Load one document the caller may access, with its categories eagerly loaded.

    A document is accessible when it is owned by ``user_id`` or by the watch
    sentinel ``_WATCH_USER_ID``.

    Raises:
        HTTPException: 404 when no accessible document has id ``doc_id``.
    """
    accessible = or_(
        Document.user_id == user_id,
        Document.user_id == _WATCH_USER_ID,
    )
    with_categories = selectinload(Document.category_assignments).selectinload(
        CategoryAssignment.category
    )
    stmt = (
        select(Document)
        .where(Document.id == doc_id, accessible)
        .options(with_categories)
    )

    found = (await db.execute(stmt)).scalar_one_or_none()
    if found is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return found
|
|
|
|
|
|
def _doc_with_categories(doc: Document) -> DocumentOut:
    """Build the ``DocumentOut`` representation of ``doc`` including its categories.

    Reads ``doc.category_assignments`` and each assignment's ``category``;
    callers in this module load those with ``selectinload`` before calling.
    """
    from app.schemas.document import CategoryOut

    categories = []
    for assignment in doc.category_assignments:
        category = assignment.category
        categories.append(CategoryOut(id=category.id, name=category.name))

    # Attributes copied one-to-one from the ORM object onto the schema.
    copied_fields = (
        "id",
        "user_id",
        "filename",
        "title",
        "file_size",
        "status",
        "document_type",
        "extracted_data",
        "tags",
        "error_message",
        "created_at",
        "processed_at",
        "source",
        "watch_path",
        "suggested_folder",
        "suggested_filename",
    )
    payload = {field: getattr(doc, field) for field in copied_fields}
    return DocumentOut(categories=categories, **payload)
|
|
|
|
|
|
def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page of the PDF at ``file_path``, newline-joined.

    Pages for which ``extract_text()`` yields nothing are skipped. This
    function is synchronous; async callers must run it via
    ``asyncio.to_thread``.
    """
    with pdfplumber.open(file_path) as pdf:
        extracted = (page.extract_text() for page in pdf.pages)
        # Joined inside the ``with`` so pages are read while the PDF is open.
        return "\n".join(chunk for chunk in extracted if chunk)
|
|
|
|
|
|
# ── Background processing ─────────────────────────────────────────────────────
|
|
|
|
async def process_document(doc_id: str) -> None:
    """Extract text from a stored PDF, classify it, and persist the outcome.

    Meant to run as a background task after the upload response is sent, so
    it opens its own session from ``AsyncSessionLocal`` instead of using the
    request's ``Depends`` session. Returns silently when ``doc_id`` does not
    exist.

    On success the document gets its raw text (capped at 500k characters),
    the classification result, title, type, tags, ``status="done"`` and a
    UTC ``processed_at``. Any exception marks the document ``"failed"`` and
    stores the first 500 characters of the error message.

    NOTE(review): the original docstring states AI config is loaded fresh
    from the config file on each run; that would happen inside
    ``classify_document``, which is not visible here.
    """
    async with AsyncSessionLocal() as session:
        document = await session.get(Document, doc_id)
        if document is None:
            return

        # Persist the state change first so status polling sees "processing".
        document.status = "processing"
        await session.commit()

        try:
            pdf_text = await asyncio.to_thread(_extract_pdf_text, document.file_path)
            classification = await classify_document(pdf_text)

            document.raw_text = pdf_text[:500_000]  # cap stored text at 500k chars
            document.extracted_data = json.dumps(classification)
            document.title = classification.get("title") or None
            document.document_type = classification.get("document_type", "unknown")
            document.tags = json.dumps(classification.get("tags", []))
            document.status = "done"
            document.processed_at = datetime.now(timezone.utc)
        except Exception as error:
            # Failures are recorded on the row rather than propagated.
            document.status = "failed"
            document.error_message = str(error)[:500]

        await session.commit()
|
|
|
|
|
|
# ── Routes ────────────────────────────────────────────────────────────────────
|
|
|
|
@router.post("/upload", response_model=DocumentOut, status_code=202)
async def upload_document(
    file: UploadFile,
    background_tasks: BackgroundTasks,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentOut:
    """Accept a PDF upload, store it, and queue it for background processing.

    The upload is accepted when its content type is ``application/pdf`` or
    ``application/octet-stream``, or when its filename ends in ``.pdf``.
    The size limit is ``documents.max_pdf_bytes`` from the doc config,
    falling back to ``_DEFAULT_MAX_BYTES``.

    Returns:
        The newly created document, in ``"pending"`` state.

    Raises:
        HTTPException: 415 for a non-PDF upload, 413 when the file exceeds
            the configured size limit.
    """
    if file.content_type not in ("application/pdf", "application/octet-stream"):
        if not (file.filename or "").lower().endswith(".pdf"):
            raise HTTPException(status_code=415, detail="Only PDF files are accepted")

    config = await load_doc_config()
    max_bytes = config.get("documents", {}).get("max_pdf_bytes", _DEFAULT_MAX_BYTES)

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    file_data = await file.read(int(max_bytes) + 1)
    if len(file_data) > max_bytes:
        raise HTTPException(
            status_code=413,
            detail=f"File exceeds maximum size of {max_bytes // (1024*1024)} MB",
        )

    doc_id = str(uuid.uuid4())
    dest = await save_upload(file_data, user_id, doc_id)

    doc = Document(
        id=doc_id,
        user_id=user_id,
        filename=file.filename or "upload.pdf",
        file_path=str(dest),
        file_size=len(file_data),
        status="pending",
    )
    db.add(doc)
    await db.commit()

    background_tasks.add_task(process_document, doc_id)

    # Re-query with selectinload so category_assignments is eagerly loaded.
    # A new doc has no categories yet, but the relationship must be populated
    # to avoid MissingGreenlet in the async session.
    doc = await _get_user_doc(doc_id, user_id, db)
    return _doc_with_categories(doc)
|
|
|
|
|
|
# Allowed values of the ``sort`` query parameter of list_documents, each
# mapped to the Document column of the same name.
_SORT_COLUMNS = {
    column_name: getattr(Document, column_name)
    for column_name in (
        "created_at",
        "processed_at",
        "filename",
        "title",
        "file_size",
        "status",
        "document_type",
    )
}
|
|
|
|
|
|
@router.get("", response_model=DocumentPage)
async def list_documents(
    page: int = Query(default=1, ge=1),
    per_page: int = Query(default=20, ge=1, le=100),
    sort: str = Query(default="created_at"),
    order: str = Query(default="desc", pattern="^(asc|desc)$"),
    status: str | None = Query(default=None),
    document_type: str | None = Query(default=None),
    search: str | None = Query(default=None),
    category_id: str | None = Query(default=None),
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentPage:
    """Return one page of the documents visible to the caller.

    Visible documents are the caller's own plus watch-ingested ones
    (owner ``_WATCH_USER_ID``).

    Args:
        page: 1-based page number.
        per_page: Page size (1-100).
        sort: Key of ``_SORT_COLUMNS``; unknown values fall back to
            ``created_at``.
        order: ``"asc"`` or ``"desc"``.
        status: Optional exact match on the document status.
        document_type: Optional exact match on the document type.
        search: Optional case-insensitive substring matched literally
            against title, filename, tags and document type.
        category_id: Optional category the document must be assigned to.

    Returns:
        The requested items plus ``total``, ``page`` and ``pages``
        (``pages`` is at least 1).
    """
    sort_col = _SORT_COLUMNS.get(sort, Document.created_at)
    sort_expr = sort_col.desc() if order == "desc" else sort_col.asc()

    # Build filter conditions once and reuse for both count + items queries.
    # Watch-ingested documents (user_id = "watch") are visible to all users.
    conditions = [or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID)]
    if status:
        conditions.append(Document.status == status)
    if document_type:
        conditions.append(Document.document_type == document_type)
    if search:
        # Escape LIKE metacharacters so "%" and "_" typed by the user match
        # themselves instead of acting as wildcards.
        escaped = search.replace("/", "//").replace("%", "/%").replace("_", "/_")
        like = f"%{escaped}%"
        conditions.append(
            or_(
                Document.title.ilike(like, escape="/"),
                Document.filename.ilike(like, escape="/"),
                Document.tags.ilike(like, escape="/"),
                Document.document_type.ilike(like, escape="/"),
            )
        )
    if category_id:
        subq = select(CategoryAssignment.document_id).where(
            CategoryAssignment.category_id == category_id
        )
        conditions.append(Document.id.in_(subq))

    count_result = await db.execute(
        select(func.count(Document.id)).where(*conditions)
    )
    total = count_result.scalar_one()

    items_result = await db.execute(
        select(Document)
        .where(*conditions)
        .options(
            selectinload(Document.category_assignments)
            .selectinload(CategoryAssignment.category)
        )
        # Secondary key makes the order total, so rows with equal sort values
        # cannot be duplicated or skipped across pages.
        .order_by(sort_expr, Document.id)
        .offset((page - 1) * per_page)
        .limit(per_page)
    )
    items = [_doc_with_categories(d) for d in items_result.scalars().all()]

    return DocumentPage(
        items=items,
        total=total,
        page=page,
        pages=max(1, math.ceil(total / per_page)),
    )
|
|
|
|
|
|
@router.get("/{doc_id}", response_model=DocumentOut)
async def get_document(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentOut:
    """Return a single accessible document with its categories.

    Raises:
        HTTPException: 404 (from ``_get_user_doc``) when the document does
            not exist or is not accessible to the caller.
    """
    return _doc_with_categories(await _get_user_doc(doc_id, user_id, db))
|
|
|
|
|
|
@router.get("/{doc_id}/status", response_model=DocumentStatusOut)
async def get_document_status(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> Document:
    """Return the document row for status polling.

    Unlike ``get_document``, no category relationships are loaded; the ORM
    object is returned directly and serialized through ``DocumentStatusOut``.

    Raises:
        HTTPException: 404 when the document is neither the caller's own
            nor watch-ingested.
    """
    visible_to_caller = or_(
        Document.user_id == user_id,
        Document.user_id == _WATCH_USER_ID,
    )
    stmt = select(Document).where(Document.id == doc_id, visible_to_caller)

    row = (await db.execute(stmt)).scalar_one_or_none()
    if row is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return row
|
|
|
|
|
|
@router.patch("/{doc_id}/type", response_model=DocumentOut)
async def update_document_type(
    doc_id: str,
    body: DocumentTypeUpdate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentOut:
    """Set the document type of an accessible document.

    Returns:
        The updated document with its categories.

    Raises:
        HTTPException: 404 when the document does not exist or is not
            accessible to the caller.
    """
    doc = await _get_user_doc(doc_id, user_id, db)
    doc.document_type = body.document_type
    await db.commit()
    # Re-query instead of db.refresh(): like the other PATCH endpoints, this
    # guarantees category_assignments is eagerly loaded again before
    # _doc_with_categories reads it (a lazy load would raise MissingGreenlet
    # in the async session).
    doc = await _get_user_doc(doc_id, user_id, db)
    return _doc_with_categories(doc)
|
|
|
|
|
|
@router.patch("/{doc_id}/tags", response_model=DocumentOut)
async def update_document_tags(
    doc_id: str,
    body: TagsUpdate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentOut:
    """Replace the tags of an accessible document.

    Tags are stripped of surrounding whitespace, empty ones are dropped, and
    case-insensitive duplicates are removed keeping the first spelling and
    the original order. The result is stored as a JSON array string.

    Raises:
        HTTPException: 404 when the document does not exist or is not
            accessible to the caller.
    """
    doc = await _get_user_doc(doc_id, user_id, db)

    # Keyed by lowercase form; dict insertion order keeps first occurrences.
    first_spelling: dict[str, str] = {}
    for raw_tag in body.tags:
        tag = raw_tag.strip()
        if tag:
            first_spelling.setdefault(tag.lower(), tag)

    doc.tags = json.dumps(list(first_spelling.values()))
    await db.commit()

    refreshed = await _get_user_doc(doc_id, user_id, db)
    return _doc_with_categories(refreshed)
|
|
|
|
|
|
@router.patch("/{doc_id}/title", response_model=DocumentOut)
async def update_document_title(
    doc_id: str,
    body: TitleUpdate,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentOut:
    """Set the title of an accessible document.

    The title is stripped of surrounding whitespace; an empty result is
    stored as ``None``.

    Raises:
        HTTPException: 404 when the document does not exist or is not
            accessible to the caller.
    """
    doc = await _get_user_doc(doc_id, user_id, db)

    stripped = body.title.strip()
    doc.title = stripped if stripped else None
    await db.commit()

    refreshed = await _get_user_doc(doc_id, user_id, db)
    return _doc_with_categories(refreshed)
|
|
|
|
|
|
@router.post("/{doc_id}/reprocess", response_model=DocumentOut)
async def reprocess_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> DocumentOut:
    """Queue an accessible document for another processing run.

    The document is reset to ``"pending"`` with its error message cleared,
    and ``process_document`` is scheduled as a background task.

    Raises:
        HTTPException: 404 when the document is not accessible, 409 when it
            is already pending or processing.
    """
    doc = await _get_user_doc(doc_id, user_id, db)

    in_flight = doc.status in ("pending", "processing")
    if in_flight:
        raise HTTPException(status_code=409, detail="Document is already being processed")

    doc.status = "pending"
    doc.error_message = None
    await db.commit()

    background_tasks.add_task(process_document, doc_id)

    queued = await _get_user_doc(doc_id, user_id, db)
    return _doc_with_categories(queued)
|
|
|
|
|
|
@router.delete("/{doc_id}", status_code=204)
async def delete_document(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Delete one of the caller's own documents and its stored file.

    Only documents whose ``user_id`` equals the caller's are matched, so
    watch-ingested documents yield 404 here.

    Raises:
        HTTPException: 404 when no document owned by the caller has this id.
    """
    result = await db.execute(
        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
    )
    doc = result.scalar_one_or_none()
    if doc is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # Capture the path now: the instance is no longer usable once deleted.
    file_path = doc.file_path

    # Commit the row deletion before touching the disk. If the commit fails
    # the file is still intact, so the row never points at a missing file.
    await db.delete(doc)
    await db.commit()
    delete_file(file_path)
|
|
|
|
|
|
@router.get("/{doc_id}/file")
async def download_file(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> StreamingResponse:
    """Stream the stored PDF of an accessible document.

    The file is read in 64 KiB chunks and served as ``application/pdf`` with
    an inline ``Content-Disposition`` carrying the document's filename.

    Raises:
        HTTPException: 404 when the document is neither the caller's own
            nor watch-ingested.
    """
    from urllib.parse import quote

    result = await db.execute(
        select(Document).where(
            Document.id == doc_id,
            or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
        )
    )
    doc = result.scalar_one_or_none()
    if doc is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # Read ORM attributes now; the generator runs while the response is being
    # sent and should not depend on the ORM instance.
    file_path = doc.file_path
    raw_name = doc.filename or "document.pdf"

    async def file_generator():
        async with aiofiles.open(file_path, "rb") as f:
            while chunk := await f.read(64 * 1024):
                yield chunk

    # The filename comes from the upload and is untrusted. The quoted
    # ``filename`` fallback is reduced to printable ASCII without quotes or
    # backslashes (so the header cannot be broken or fail to encode), and the
    # exact name is carried percent-encoded in ``filename*`` (RFC 6266).
    ascii_name = "".join(
        ch if (" " <= ch <= "~" and ch not in '"\\') else "_" for ch in raw_name
    )
    encoded_name = quote(raw_name, safe="")
    disposition = (
        f'inline; filename="{ascii_name}"; '
        f"filename*=UTF-8''{encoded_name}"
    )

    return StreamingResponse(
        file_generator(),
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
|
|
|
|
# ── Category assignment ───────────────────────────────────────────────────────
|
|
|
|
@router.post("/{doc_id}/categories/{cat_id}", status_code=204)
async def assign_category(
    doc_id: str,
    cat_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Assign one of the caller's categories to an accessible document.

    Assigning a category that is already assigned is a no-op.

    Raises:
        HTTPException: 404 when the document is not accessible (neither own
            nor watch-ingested), or when the category is not owned by the
            caller.
    """
    # The document must be the caller's own or watch-ingested.
    doc_stmt = select(Document).where(
        Document.id == doc_id,
        or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
    )
    if (await db.execute(doc_stmt)).scalar_one_or_none() is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # The category must belong to the caller.
    cat_stmt = select(DocumentCategory).where(
        DocumentCategory.id == cat_id, DocumentCategory.user_id == user_id
    )
    if (await db.execute(cat_stmt)).scalar_one_or_none() is None:
        raise HTTPException(status_code=404, detail="Category not found")

    # Insert only when the pair is not linked yet.
    link_stmt = select(CategoryAssignment).where(
        CategoryAssignment.document_id == doc_id,
        CategoryAssignment.category_id == cat_id,
    )
    current_link = (await db.execute(link_stmt)).scalar_one_or_none()
    if current_link is None:
        db.add(CategoryAssignment(document_id=doc_id, category_id=cat_id))
        await db.commit()
|
|
|
|
|
|
@router.delete("/{doc_id}/categories/{cat_id}", status_code=204)
async def remove_category(
    doc_id: str,
    cat_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Remove a category assignment from an accessible document.

    Removing an assignment that does not exist is a no-op.

    Raises:
        HTTPException: 404 when the document is neither the caller's own
            nor watch-ingested.
    """
    # Authorize first, with the same access rule assign_category applies.
    # Without this check any authenticated user could drop assignments from
    # other users' documents just by knowing the ids.
    doc_result = await db.execute(
        select(Document).where(
            Document.id == doc_id,
            or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
        )
    )
    if doc_result.scalar_one_or_none() is None:
        raise HTTPException(status_code=404, detail="Document not found")

    result = await db.execute(
        select(CategoryAssignment).where(
            CategoryAssignment.document_id == doc_id,
            CategoryAssignment.category_id == cat_id,
        )
    )
    assignment = result.scalar_one_or_none()
    if assignment:
        await db.delete(assignment)
        await db.commit()
|
|
|
|
|
|
# ── AI suggestion confirmation ────────────────────────────────────────────────
|
|
# These endpoints allow users to confirm or reject AI suggestions on
|
|
# watch-ingested documents. No disk mutations — suggestions only update the DB.
|
|
|
|
@router.post("/{doc_id}/suggestions/folder/confirm", status_code=204)
async def confirm_folder_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Accept the pending folder suggestion of an accessible document.

    The suggested folder name is mapped to a category owned by the watch
    sentinel user (created when missing), the document is assigned to it if
    it is not already, and the suggestion is cleared. Only the database is
    changed.

    Raises:
        HTTPException: 404 when the document is not accessible, 400 when no
            folder suggestion is pending.
    """
    doc = await _get_user_doc(doc_id, user_id, db)
    if not doc.suggested_folder:
        raise HTTPException(status_code=400, detail="No folder suggestion pending")

    # Truncate once and use the same value for lookup and creation. Looking
    # up the untruncated name would never match a category stored under the
    # truncated name, so every confirmation of a long suggestion would try
    # to create it again.
    # NOTE(review): 128 is presumably the name column length - confirm.
    category_name = doc.suggested_folder[:128]

    # Find or create the suggested category under the watch sentinel user
    cat_result = await db.execute(
        select(DocumentCategory).where(
            DocumentCategory.user_id == _WATCH_USER_ID,
            DocumentCategory.name == category_name,
        )
    )
    cat = cat_result.scalar_one_or_none()
    if cat is None:
        cat = DocumentCategory(user_id=_WATCH_USER_ID, name=category_name)
        db.add(cat)
        await db.commit()
        await db.refresh(cat)

    # Assign if not already assigned
    exists = await db.execute(
        select(CategoryAssignment).where(
            CategoryAssignment.document_id == doc_id,
            CategoryAssignment.category_id == cat.id,
        )
    )
    if exists.scalar_one_or_none() is None:
        db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))

    doc.suggested_folder = None
    await db.commit()
|
|
|
|
|
|
@router.post("/{doc_id}/suggestions/folder/reject", status_code=204)
async def reject_folder_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Discard the folder suggestion of an accessible document.

    Succeeds whether or not a suggestion was pending.

    Raises:
        HTTPException: 404 when the document does not exist or is not
            accessible to the caller.
    """
    target = await _get_user_doc(doc_id, user_id, db)
    target.suggested_folder = None
    await db.commit()
|
|
|
|
|
|
@router.post("/{doc_id}/suggestions/filename/confirm", status_code=204)
async def confirm_filename_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Accept the pending filename suggestion of an accessible document.

    The suggestion becomes the document's title and is then cleared; the
    stored filename is not touched.

    Raises:
        HTTPException: 404 when the document is not accessible, 400 when no
            filename suggestion is pending.
    """
    target = await _get_user_doc(doc_id, user_id, db)

    suggestion = target.suggested_filename
    if not suggestion:
        raise HTTPException(status_code=400, detail="No filename suggestion pending")

    target.title = suggestion
    target.suggested_filename = None
    await db.commit()
|
|
|
|
|
|
@router.post("/{doc_id}/suggestions/filename/reject", status_code=204)
async def reject_filename_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Discard the filename suggestion of an accessible document.

    Succeeds whether or not a suggestion was pending.

    Raises:
        HTTPException: 404 when the document does not exist or is not
            accessible to the caller.
    """
    target = await _get_user_doc(doc_id, user_id, db)
    target.suggested_filename = None
    await db.commit()
|