feat(05-09): PATCH /documents/{id} endpoint + cloud-aware Celery re-analyze

- Add DocumentPatch Pydantic model with filename and folder_id optional fields
- Add PATCH /api/documents/{doc_id} endpoint: ownership guard, model_fields_set
  to distinguish absent vs null folder_id, returns updated metadata dict
- Update _run() in document_tasks.py to use get_storage_backend_for_document
  for non-MinIO backends instead of hardcoded MinIO path
- CloudConnectionError caught in cloud path: returns extract_failed status
- Update test to use pure unit mocks (no PostgreSQL) for _run() cloud routing
- All 3 plan tests pass; 23 test_cloud.py tests pass
This commit is contained in:
curo1305
2026-05-30 11:16:01 +02:00
parent 9bc056100c
commit 6d094d17f0
3 changed files with 143 additions and 29 deletions
+60
View File
@@ -69,6 +69,19 @@ class UploadUrlRequest(BaseModel):
content_type: str
class DocumentPatch(BaseModel):
"""Pydantic model for PATCH /api/documents/{doc_id}.
Optional fields — model_fields_set distinguishes "not provided" from "set to null".
At least one field must be present in model_fields_set (enforced in the handler).
T-05-09-01: explicit field declaration prevents mass assignment.
T-05-09-02: only filename and folder_id are accepted — no other fields can be set.
"""
filename: Optional[str] = None
folder_id: Optional[uuid.UUID] = None
# ── POST /api/documents/upload-url ───────────────────────────────────────────
@router.post("/upload-url")
@@ -520,6 +533,53 @@ async def get_document(
return meta
# ── PATCH /api/documents/{doc_id} ────────────────────────────────────────────
@router.patch("/{doc_id}")
async def patch_document(
doc_id: str,
body: DocumentPatch,
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Update document metadata (filename and/or folder_id).
T-05-09-01: get_regular_user dep rejects admins (403) and unauthenticated (401).
T-05-09-01: ownership check — non-owner gets 404 to avoid leaking document IDs (D-16).
T-05-09-02: response uses storage.get_metadata() which excludes credentials_enc and
password_hash via the _doc_to_dict whitelist.
At least one field must be provided — empty body returns 422.
folder_id=null moves the document to the root (no folder).
"""
try:
uid = uuid.UUID(doc_id)
except ValueError:
raise HTTPException(404, "Document not found")
doc = await session.get(Document, uid)
if doc is None or doc.user_id != current_user.id:
raise HTTPException(404, "Document not found")
# Require at least one field to be set (model_fields_set tracks provided fields)
if not body.model_fields_set:
raise HTTPException(422, "At least one field (filename, folder_id) must be provided")
if "filename" in body.model_fields_set and body.filename is not None:
doc.filename = body.filename
if "folder_id" in body.model_fields_set:
# folder_id=null → move to root (no folder); folder_id=<uuid> → move to folder
doc.folder_id = body.folder_id
await session.commit()
meta = await storage.get_metadata(session, doc_id)
if meta is None:
raise HTTPException(404, "Document not found")
return meta
# ── DELETE /api/documents/{doc_id} ───────────────────────────────────────────
@router.delete("/{doc_id}")
+33 -5
View File
@@ -30,13 +30,17 @@ async def _run(document_id: str) -> dict:
Opens its own AsyncSession (not shared with the upload request) to avoid
cross-thread session contamination.
Cloud-aware: when doc.storage_backend != 'minio', uses
get_storage_backend_for_document() to retrieve bytes from the correct
cloud backend instead of hardcoding MinIO.
"""
import uuid as _uuid
from db.session import AsyncSessionLocal
from db.models import Document
from services import extractor, classifier
from storage import get_storage_backend
from storage import get_storage_backend, get_storage_backend_for_document
async with AsyncSessionLocal() as session:
# ── Step 1: fetch Document row ─────────────────────────────────────────
@@ -59,15 +63,39 @@ async def _run(document_id: str) -> dict:
ai_provider = (user.ai_provider if user else None) or app_settings.default_ai_provider
ai_model = (user.ai_model if user else None) or app_settings.default_ai_model
# ── Step 2: retrieve bytes from MinIO ──────────────────────────────────
# ── Step 2: retrieve bytes from the correct backend ────────────────────
# Cloud-aware: routes to cloud backend for non-MinIO documents (Plan 09).
# T-05-09-03: cloud credentials are loaded from DB inside this task's own
# session — no credentials travel through the Celery broker message.
try:
backend = get_storage_backend()
file_bytes = await backend.get_object(doc.object_key)
if doc.storage_backend is None or doc.storage_backend == "minio":
backend = get_storage_backend()
file_bytes = await backend.get_object(doc.object_key)
else:
# Cloud path: user must be present (doc.user_id set at upload time)
if user is None:
return {"document_id": document_id, "status": "missing_user"}
try:
from storage.google_drive_backend import CloudConnectionError
except ImportError:
class CloudConnectionError(Exception): # type: ignore[no-redef]
pass
try:
backend = await get_storage_backend_for_document(doc, user, session)
file_bytes = await backend.get_object(doc.object_key)
except CloudConnectionError:
return {
"document_id": document_id,
"status": "extract_failed",
"error": "cloud backend error",
}
except Exception as e:
return {
"document_id": document_id,
"status": "extract_failed",
"error": f"MinIO retrieval failed: {e}",
"error": f"retrieval failed: {e}",
}
# ── Step 3: extract text from bytes ────────────────────────────────────
+50 -24
View File
@@ -638,50 +638,76 @@ async def test_patch_document_wrong_owner(async_client, db_session):
assert resp.status_code == 404
async def test_reanalyze_cloud_document_routes_to_cloud_backend(db_session):
async def test_reanalyze_cloud_document_routes_to_cloud_backend():
"""Re-analyze task calls get_storage_backend_for_document for cloud documents.
Verifies that doc.storage_backend != 'minio' causes _run() to use the cloud
backend path instead of the MinIO path (Plan 09, requirement CLOUD-07).
Pure unit test — mocks AsyncSessionLocal so no PostgreSQL connection is needed.
"""
from db.models import Document
from tasks.document_tasks import _run
from unittest.mock import AsyncMock, patch, MagicMock
auth = await _create_user_and_token(db_session, role="user")
# Create a nextcloud document
doc_id = _uuid.uuid4()
doc = Document(
id=doc_id,
user_id=auth["user"].id,
filename="cloud.pdf",
content_type="application/pdf",
size_bytes=2048,
storage_backend="nextcloud",
status="uploaded",
object_key="nc_file_id_xyz",
)
db_session.add(doc)
await db_session.commit()
user_id = _uuid.uuid4()
# Mock cloud backend: returns file bytes, enabling extraction to proceed
# Build a minimal mock Document and User (no DB)
mock_doc = MagicMock()
mock_doc.id = doc_id
mock_doc.user_id = user_id
mock_doc.storage_backend = "nextcloud"
mock_doc.object_key = "nc_file_id_xyz"
mock_doc.content_type = "application/pdf"
mock_doc.filename = "cloud.pdf"
mock_doc.status = "uploaded"
mock_user = MagicMock()
mock_user.id = user_id
mock_user.ai_provider = None
mock_user.ai_model = None
# Mock cloud backend: returns fake bytes so extraction can proceed
mock_cloud_backend = AsyncMock()
mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake content")
mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake")
# Mock MinIO backend to verify it is NOT called
mock_minio_backend = AsyncMock()
mock_minio_backend.get_object = AsyncMock(return_value=b"should not be called")
with patch("tasks.document_tasks.get_storage_backend_for_document", return_value=mock_cloud_backend) as mock_gsb_doc, \
patch("tasks.document_tasks.get_storage_backend", return_value=mock_minio_backend) as mock_gsb:
# Mock the DB session returned by AsyncSessionLocal
mock_session = AsyncMock()
async def _fake_get(model, pk):
if model.__name__ == "Document":
return mock_doc
if model.__name__ == "User":
return mock_user
return None
mock_session.get = _fake_get
# AsyncSessionLocal is an async context manager; mock it
class _FakeSessionCM:
async def __aenter__(self):
return mock_session
async def __aexit__(self, *args):
pass
# Patch at the storage module level (source of the functions used via deferred import)
with patch("db.session.AsyncSessionLocal", return_value=_FakeSessionCM()), \
patch("storage.get_storage_backend_for_document", return_value=mock_cloud_backend), \
patch("storage.get_storage_backend", return_value=mock_minio_backend), \
patch("services.extractor.extract_text_from_bytes", return_value="extracted text"), \
patch("services.classifier.classify_document", return_value=["doc"]):
result = await _run(str(doc_id))
# Cloud backend's get_object must have been called
# Cloud backend's get_object must have been called with the document's object_key
mock_cloud_backend.get_object.assert_called_once_with("nc_file_id_xyz")
# MinIO backend's get_object must NOT have been called
mock_minio_backend.get_object.assert_not_called()
# Result must not be an error from MinIO path
assert result.get("status") != "extract_failed" or "MinIO" not in result.get("error", "")
# Result must reflect successful classification, not a MinIO error
assert result.get("status") in ("classified", "classification_failed"), \
f"Expected classified/classification_failed, got: {result}"