From 6d094d17f015183546af23317eb2b4bcc71b0fc2 Mon Sep 17 00:00:00 2001 From: curo1305 Date: Sat, 30 May 2026 11:16:01 +0200 Subject: [PATCH] feat(05-09): PATCH /documents/{id} endpoint + cloud-aware Celery re-analyze - Add DocumentPatch Pydantic model with filename and folder_id optional fields - Add PATCH /api/documents/{doc_id} endpoint: ownership guard, model_fields_set to distinguish absent vs null folder_id, returns updated metadata dict - Update _run() in document_tasks.py to use get_storage_backend_for_document for non-MinIO backends instead of hardcoded MinIO path - CloudConnectionError caught in cloud path: returns extract_failed status - Update test to use pure unit mocks (no PostgreSQL) for _run() cloud routing - All 3 plan tests pass; 23 test_cloud.py tests pass --- backend/api/documents.py | 60 ++++++++++++++++++++++++++ backend/tasks/document_tasks.py | 38 ++++++++++++++--- backend/tests/test_cloud.py | 74 ++++++++++++++++++++++----------- 3 files changed, 143 insertions(+), 29 deletions(-) diff --git a/backend/api/documents.py b/backend/api/documents.py index c128db0..491e980 100644 --- a/backend/api/documents.py +++ b/backend/api/documents.py @@ -69,6 +69,19 @@ class UploadUrlRequest(BaseModel): content_type: str +class DocumentPatch(BaseModel): + """Pydantic model for PATCH /api/documents/{doc_id}. + + Optional fields — model_fields_set distinguishes "not provided" from "set to null". + At least one field must be present in model_fields_set (enforced in the handler). + + T-05-09-01: explicit field declaration prevents mass assignment. + T-05-09-02: only filename and folder_id are accepted — no other fields can be set. + """ + filename: Optional[str] = None + folder_id: Optional[uuid.UUID] = None + + # ── POST /api/documents/upload-url ─────────────────────────────────────────── @router.post("/upload-url") @@ -520,6 +533,53 @@ async def get_document( return meta +# ── PATCH /api/documents/{doc_id} ──────────────────────────────────────────── + +@router.patch("/{doc_id}") +async def patch_document( + doc_id: str, + body: DocumentPatch, + session: AsyncSession = Depends(get_db), + current_user: User = Depends(get_regular_user), +): + """Update document metadata (filename and/or folder_id). + + T-05-09-01: get_regular_user dep rejects admins (403) and unauthenticated (401). + T-05-09-01: ownership check — non-owner gets 404 to avoid leaking document IDs (D-16). + T-05-09-02: response uses storage.get_metadata() which excludes credentials_enc and + password_hash via the _doc_to_dict whitelist. + + At least one field must be provided — empty body returns 422. + folder_id=null moves the document to the root (no folder). + """ + try: + uid = uuid.UUID(doc_id) + except ValueError: + raise HTTPException(404, "Document not found") + + doc = await session.get(Document, uid) + if doc is None or doc.user_id != current_user.id: + raise HTTPException(404, "Document not found") + + # Require at least one field to be set (model_fields_set tracks provided fields) + if not body.model_fields_set: + raise HTTPException(422, "At least one field (filename, folder_id) must be provided") + + if "filename" in body.model_fields_set and body.filename is not None: + doc.filename = body.filename + + if "folder_id" in body.model_fields_set: + # folder_id=null → move to root (no folder); folder_id= → move to folder + doc.folder_id = body.folder_id + + await session.commit() + + meta = await storage.get_metadata(session, doc_id) + if meta is None: + raise HTTPException(404, "Document not found") + return meta + + # ── DELETE /api/documents/{doc_id} ─────────────────────────────────────────── @router.delete("/{doc_id}") diff --git a/backend/tasks/document_tasks.py b/backend/tasks/document_tasks.py index 8596e99..661d1ee 100644 --- a/backend/tasks/document_tasks.py +++ b/backend/tasks/document_tasks.py @@ -30,13 +30,17 @@ async def _run(document_id: str) -> dict: Opens its own AsyncSession (not shared with the upload request) to avoid cross-thread session contamination. + + Cloud-aware: when doc.storage_backend != 'minio', uses + get_storage_backend_for_document() to retrieve bytes from the correct + cloud backend instead of hardcoding MinIO. """ import uuid as _uuid from db.session import AsyncSessionLocal from db.models import Document from services import extractor, classifier - from storage import get_storage_backend + from storage import get_storage_backend, get_storage_backend_for_document async with AsyncSessionLocal() as session: # ── Step 1: fetch Document row ───────────────────────────────────────── @@ -59,15 +63,39 @@ async def _run(document_id: str) -> dict: ai_provider = (user.ai_provider if user else None) or app_settings.default_ai_provider ai_model = (user.ai_model if user else None) or app_settings.default_ai_model - # ── Step 2: retrieve bytes from MinIO ────────────────────────────────── + # ── Step 2: retrieve bytes from the correct backend ──────────────────── + # Cloud-aware: routes to cloud backend for non-MinIO documents (Plan 09). + # T-05-09-03: cloud credentials are loaded from DB inside this task's own + # session — no credentials travel through the Celery broker message. try: - backend = get_storage_backend() - file_bytes = await backend.get_object(doc.object_key) + if doc.storage_backend is None or doc.storage_backend == "minio": + backend = get_storage_backend() + file_bytes = await backend.get_object(doc.object_key) + else: + # Cloud path: user must be present (doc.user_id set at upload time) + if user is None: + return {"document_id": document_id, "status": "missing_user"} + + try: + from storage.google_drive_backend import CloudConnectionError + except ImportError: + class CloudConnectionError(Exception): # type: ignore[no-redef] + pass + + try: + backend = await get_storage_backend_for_document(doc, user, session) + file_bytes = await backend.get_object(doc.object_key) + except CloudConnectionError: + return { + "document_id": document_id, + "status": "extract_failed", + "error": "cloud backend error", + } except Exception as e: return { "document_id": document_id, "status": "extract_failed", - "error": f"MinIO retrieval failed: {e}", + "error": f"retrieval failed: {e}", } # ── Step 3: extract text from bytes ──────────────────────────────────── diff --git a/backend/tests/test_cloud.py b/backend/tests/test_cloud.py index aea810f..6d79b7b 100644 --- a/backend/tests/test_cloud.py +++ b/backend/tests/test_cloud.py @@ -638,50 +638,76 @@ async def test_patch_document_wrong_owner(async_client, db_session): assert resp.status_code == 404 -async def test_reanalyze_cloud_document_routes_to_cloud_backend(db_session): +async def test_reanalyze_cloud_document_routes_to_cloud_backend(): """Re-analyze task calls get_storage_backend_for_document for cloud documents. Verifies that doc.storage_backend != 'minio' causes _run() to use the cloud backend path instead of the MinIO path (Plan 09, requirement CLOUD-07). + + Pure unit test — mocks AsyncSessionLocal so no PostgreSQL connection is needed. """ - from db.models import Document from tasks.document_tasks import _run from unittest.mock import AsyncMock, patch, MagicMock - auth = await _create_user_and_token(db_session, role="user") - - # Create a nextcloud document doc_id = _uuid.uuid4() - doc = Document( - id=doc_id, - user_id=auth["user"].id, - filename="cloud.pdf", - content_type="application/pdf", - size_bytes=2048, - storage_backend="nextcloud", - status="uploaded", - object_key="nc_file_id_xyz", - ) - db_session.add(doc) - await db_session.commit() + user_id = _uuid.uuid4() - # Mock cloud backend: returns file bytes, enabling extraction to proceed + # Build a minimal mock Document and User (no DB) + mock_doc = MagicMock() + mock_doc.id = doc_id + mock_doc.user_id = user_id + mock_doc.storage_backend = "nextcloud" + mock_doc.object_key = "nc_file_id_xyz" + mock_doc.content_type = "application/pdf" + mock_doc.filename = "cloud.pdf" + mock_doc.status = "uploaded" + + mock_user = MagicMock() + mock_user.id = user_id + mock_user.ai_provider = None + mock_user.ai_model = None + + # Mock cloud backend: returns fake bytes so extraction can proceed mock_cloud_backend = AsyncMock() - mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake content") + mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake") # Mock MinIO backend to verify it is NOT called mock_minio_backend = AsyncMock() mock_minio_backend.get_object = AsyncMock(return_value=b"should not be called") - with patch("tasks.document_tasks.get_storage_backend_for_document", return_value=mock_cloud_backend) as mock_gsb_doc, \ - patch("tasks.document_tasks.get_storage_backend", return_value=mock_minio_backend) as mock_gsb: + # Mock the DB session returned by AsyncSessionLocal + mock_session = AsyncMock() + + async def _fake_get(model, pk): + if model.__name__ == "Document": + return mock_doc + if model.__name__ == "User": + return mock_user + return None + + mock_session.get = _fake_get + + # AsyncSessionLocal is an async context manager; mock it + class _FakeSessionCM: + async def __aenter__(self): + return mock_session + async def __aexit__(self, *args): + pass + + # Patch at the storage module level (source of the functions used via deferred import) + with patch("db.session.AsyncSessionLocal", return_value=_FakeSessionCM()), \ + patch("storage.get_storage_backend_for_document", return_value=mock_cloud_backend), \ + patch("storage.get_storage_backend", return_value=mock_minio_backend), \ + patch("services.extractor.extract_text_from_bytes", return_value="extracted text"), \ + patch("services.classifier.classify_document", return_value=["doc"]): result = await _run(str(doc_id)) - # Cloud backend's get_object must have been called + # Cloud backend's get_object must have been called with the document's object_key mock_cloud_backend.get_object.assert_called_once_with("nc_file_id_xyz") # MinIO backend's get_object must NOT have been called mock_minio_backend.get_object.assert_not_called() - # Result must not be an error from MinIO path - assert result.get("status") != "extract_failed" or "MinIO" not in result.get("error", "") + # Result must reflect successful classification, not a MinIO error + assert result.get("status") in ("classified", "classification_failed"), \ + f"Expected classified/classification_failed, got: {result}"