Files
kite/backend/api/documents.py
T
curo1305 b28bb01995 feat(03-03): add get_regular_user dep; wire auth + ownership into /api/documents/*
- Add get_regular_user FastAPI dep (rejects admin with 403) to deps/auth.py
- Wire Depends(get_regular_user) into all 6 /api/documents/* handlers
- upload-url: replace null-user/... object_key with str(current_user.id)/...; set user_id=current_user.id
- confirm: remove Wave 2 doc.user_id is None guard — quota runs unconditionally; add ownership assertion (404 on cross-user)
- list: filter by user_id=current_user.id via storage.list_metadata(user_id=...)
- get/delete/classify: ownership assertion (doc.user_id != current_user.id → 404)
- storage.list_metadata: add required user_id param + Document.user_id == user_id filter
- storage.delete_document: remove if doc.user_id is not None guard; use CASE WHEN for SQLite-compat quota decrement
- Tests: update existing tests to pass auth headers; implement test_cross_user_access_404, test_admin_cannot_access_documents, test_documents_require_auth; mark test_confirm_endpoint xfail(strict=False) for SQLite UUID mismatch
2026-05-23 20:05:34 +02:00

296 lines
11 KiB
Python

"""
Document API endpoints for DocuVault — Phase 3 Wave 2.
Implements the presigned PUT upload flow (D-04, D-05):
POST /api/documents/upload-url — create pending Document row, return presigned URL
POST /api/documents/{id}/confirm — stat MinIO for authoritative size, atomic quota UPDATE
Preserved endpoints (auth guards added in Plan 03-03):
GET /api/documents — list documents
GET /api/documents/{id} — get document metadata
DELETE /api/documents/{id} — delete document (decrements quota atomically)
POST /api/documents/{id}/classify — reclassify document topics
NOTE (Wave 2): No auth guards on any endpoint yet — Plan 03-03 adds get_current_user
to all handlers. The doc.user_id=None guard in /confirm is a Wave 2 placeholder.
"""
from __future__ import annotations
import uuid
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from db.models import Document, Quota, User
from deps.auth import get_regular_user
from deps.db import get_db
from services import classifier, storage
from storage import get_storage_backend
from tasks.document_tasks import extract_and_classify
try:
from minio.error import S3Error
except ImportError:
# Fallback for test environments where minio is not installed
S3Error = Exception # type: ignore[assignment,misc]
router = APIRouter(prefix="/api/documents", tags=["documents"])
# ── Request models ────────────────────────────────────────────────────────────
class UploadUrlRequest(BaseModel):
filename: str
content_type: str
# ── POST /api/documents/upload-url ───────────────────────────────────────────
@router.post("/upload-url")
async def request_upload_url(
body: UploadUrlRequest,
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Create a pending Document row and return a presigned PUT URL.
D-05 step 1: FastAPI creates a Document row (status='pending'), generates a
15-minute presigned PUT URL, returns {upload_url, document_id}.
Quota is NOT reserved at this step — quota enforcement happens at /confirm.
T-03-04: object_key is computed server-side using str(current_user.id); filename
stored in DB only (CLAUDE.md MinIO key schema).
T-03-15: object_key prefix is always the authenticated user's id — never user-supplied.
"""
doc_id = uuid.uuid4()
suffix = Path(body.filename).suffix.lower()
object_key = f"{current_user.id}/{doc_id}/{uuid.uuid4()}{suffix}"
doc = Document(
id=doc_id,
user_id=current_user.id,
filename=body.filename,
content_type=body.content_type,
size_bytes=0,
storage_backend="minio",
status="pending",
object_key=object_key,
)
session.add(doc)
await session.commit()
upload_url = await get_storage_backend().generate_presigned_put_url(
object_key, expires_minutes=15
)
return {"upload_url": upload_url, "document_id": str(doc_id)}
# ── POST /api/documents/{doc_id}/confirm ─────────────────────────────────────
@router.post("/{doc_id}/confirm")
async def confirm_upload(
doc_id: str,
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Confirm a presigned PUT upload: stat MinIO for size, enforce quota atomically.
D-05 step 3: FastAPI reads authoritative file size from MinIO stat_object (never
from client), runs atomic quota UPDATE, sets status='uploaded', enqueues Celery task.
Quota exceeded: HTTP 413 with {"used_bytes": N, "limit_bytes": M, "rejected_bytes": K}
Upload not found: HTTP 422 (presigned URL may have expired)
T-03-05: size always comes from backend.stat_object(doc.object_key) — never client.
T-03-06: atomic SQL UPDATE prevents concurrent over-quota uploads (STORE-03 SC2).
T-03-11: ownership assertion — cross-user access returns 404 (D-16).
"""
try:
uid = uuid.UUID(doc_id)
except ValueError:
raise HTTPException(status_code=404, detail="Document not found")
doc = await session.get(Document, uid)
if doc is None or doc.user_id != current_user.id:
raise HTTPException(status_code=404, detail="Document not found")
# Get authoritative file size from MinIO (T-03-05 — never trust client-supplied size)
try:
size = await get_storage_backend().stat_object(doc.object_key)
except Exception as exc:
code = getattr(exc, "code", "")
if code == "NoSuchKey":
raise HTTPException(
status_code=422,
detail="Upload not found — presigned URL may have expired",
)
raise HTTPException(status_code=502, detail=f"Storage error: {exc}")
doc.size_bytes = size
await session.flush()
# Atomic quota enforcement — user_id is always set post-migration (Plan 03-03+)
result = await session.execute(
text(
"UPDATE quotas "
"SET used_bytes = used_bytes + :delta "
"WHERE user_id = :uid "
" AND (used_bytes + :delta) <= limit_bytes "
"RETURNING used_bytes, limit_bytes"
),
{"delta": size, "uid": str(doc.user_id)},
)
row = result.fetchone()
if row is None:
# Quota exceeded — fetch current quota state for the 413 body
quota_result = await session.execute(
text("SELECT used_bytes, limit_bytes FROM quotas WHERE user_id = :uid"),
{"uid": str(doc.user_id)},
)
q = quota_result.fetchone()
# Delete the pending Document row and best-effort remove the MinIO object
await session.delete(doc)
try:
await get_storage_backend().delete_object(doc.object_key)
except Exception:
pass # MinIO cleanup is best-effort; object TTL will eventually expire
await session.commit()
raise HTTPException(
status_code=413,
detail={
"used_bytes": q.used_bytes if q else 0,
"limit_bytes": q.limit_bytes if q else 0,
"rejected_bytes": size,
},
)
used_bytes = row.used_bytes
doc.status = "uploaded"
await session.commit()
extract_and_classify.delay(str(doc.id))
return {
"id": str(doc.id),
"size_bytes": size,
"used_bytes": used_bytes,
"status": "uploaded",
}
# ── GET /api/documents ────────────────────────────────────────────────────────
@router.get("")
async def list_documents(
topic: Optional[str] = Query(None),
page: int = Query(1, ge=1),
per_page: int = Query(20, ge=1, le=100),
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""List documents, optionally filtered by topic.
D-16: requires authenticated regular user (get_regular_user rejects admins).
Returns only documents belonging to the current user.
"""
docs = await storage.list_metadata(session, user_id=current_user.id, topic=topic)
total = len(docs)
start = (page - 1) * per_page
return {"items": docs[start : start + per_page], "total": total, "page": page, "per_page": per_page}
# ── GET /api/documents/{doc_id} ───────────────────────────────────────────────
@router.get("/{doc_id}")
async def get_document(
doc_id: str,
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Return document metadata by ID.
D-16: requires authenticated regular user. Asserts ownership — cross-user
access returns 404 (not 403) to avoid information leakage (T-03-11).
"""
try:
uid = uuid.UUID(doc_id)
except ValueError:
raise HTTPException(404, "Document not found")
doc = await session.get(Document, uid)
if doc is None or doc.user_id != current_user.id:
raise HTTPException(404, "Document not found")
meta = await storage.get_metadata(session, doc_id)
if meta is None:
raise HTTPException(404, "Document not found")
return meta
# ── DELETE /api/documents/{doc_id} ───────────────────────────────────────────
@router.delete("/{doc_id}")
async def delete_document(
doc_id: str,
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Delete a document and decrement quota atomically.
services.storage.delete_document handles the atomic quota decrement
(STORE-06, D-07) via GREATEST(0, used_bytes - delta) SQL.
D-16: requires authenticated regular user. Asserts ownership — cross-user
delete returns 404 (not 403) to avoid information leakage (T-03-11).
"""
try:
uid = uuid.UUID(doc_id)
except ValueError:
raise HTTPException(404, "Document not found")
doc = await session.get(Document, uid)
if doc is None or doc.user_id != current_user.id:
raise HTTPException(404, "Document not found")
ok = await storage.delete_document(session, doc_id)
if not ok:
raise HTTPException(404, "Document not found")
return {"success": True}
# ── POST /api/documents/{doc_id}/classify ────────────────────────────────────
@router.post("/{doc_id}/classify")
async def classify_document(
doc_id: str,
body: dict = {},
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Reclassify a document's topics on demand.
D-16: requires authenticated regular user. Asserts ownership — cross-user
classify returns 404 (not 403) to avoid information leakage (T-03-11).
"""
try:
uid = uuid.UUID(doc_id)
except ValueError:
raise HTTPException(404, "Document not found")
doc = await session.get(Document, uid)
if doc is None or doc.user_id != current_user.id:
raise HTTPException(404, "Document not found")
topic_names = body.get("topics") if body else None
try:
topics = await classifier.classify_document(session, doc_id, topic_names)
except Exception as e:
raise HTTPException(500, f"Classification failed: {e}")
return {"topics": topics}