feat(05-09): PATCH /documents/{id} endpoint + cloud-aware Celery re-analyze

- Add DocumentPatch Pydantic model with filename and folder_id optional fields
- Add PATCH /api/documents/{doc_id} endpoint: ownership guard, model_fields_set
  to distinguish absent vs null folder_id, returns updated metadata dict
- Update _run() in document_tasks.py to use get_storage_backend_for_document
  for non-MinIO backends instead of hardcoded MinIO path
- CloudConnectionError caught in cloud path: returns extract_failed status
- Update test to use pure unit mocks (no PostgreSQL) for _run() cloud routing
- All 3 plan tests pass; 23 test_cloud.py tests pass
This commit is contained in:
curo1305
2026-05-30 11:16:01 +02:00
parent 9bc056100c
commit 6d094d17f0
3 changed files with 143 additions and 29 deletions
+60
View File
@@ -69,6 +69,19 @@ class UploadUrlRequest(BaseModel):
content_type: str content_type: str
class DocumentPatch(BaseModel):
"""Pydantic model for PATCH /api/documents/{doc_id}.
Optional fields — model_fields_set distinguishes "not provided" from "set to null".
At least one field must be present in model_fields_set (enforced in the handler).
T-05-09-01: explicit field declaration prevents mass assignment.
T-05-09-02: only filename and folder_id are accepted — no other fields can be set.
"""
filename: Optional[str] = None
folder_id: Optional[uuid.UUID] = None
# ── POST /api/documents/upload-url ─────────────────────────────────────────── # ── POST /api/documents/upload-url ───────────────────────────────────────────
@router.post("/upload-url") @router.post("/upload-url")
@@ -520,6 +533,53 @@ async def get_document(
return meta return meta
# ── PATCH /api/documents/{doc_id} ────────────────────────────────────────────
@router.patch("/{doc_id}")
async def patch_document(
doc_id: str,
body: DocumentPatch,
session: AsyncSession = Depends(get_db),
current_user: User = Depends(get_regular_user),
):
"""Update document metadata (filename and/or folder_id).
T-05-09-01: get_regular_user dep rejects admins (403) and unauthenticated (401).
T-05-09-01: ownership check — non-owner gets 404 to avoid leaking document IDs (D-16).
T-05-09-02: response uses storage.get_metadata() which excludes credentials_enc and
password_hash via the _doc_to_dict whitelist.
At least one field must be provided — empty body returns 422.
folder_id=null moves the document to the root (no folder).
"""
try:
uid = uuid.UUID(doc_id)
except ValueError:
raise HTTPException(404, "Document not found")
doc = await session.get(Document, uid)
if doc is None or doc.user_id != current_user.id:
raise HTTPException(404, "Document not found")
# Require at least one field to be set (model_fields_set tracks provided fields)
if not body.model_fields_set:
raise HTTPException(422, "At least one field (filename, folder_id) must be provided")
if "filename" in body.model_fields_set and body.filename is not None:
doc.filename = body.filename
if "folder_id" in body.model_fields_set:
# folder_id=null → move to root (no folder); folder_id=<uuid> → move to folder
doc.folder_id = body.folder_id
await session.commit()
meta = await storage.get_metadata(session, doc_id)
if meta is None:
raise HTTPException(404, "Document not found")
return meta
# ── DELETE /api/documents/{doc_id} ─────────────────────────────────────────── # ── DELETE /api/documents/{doc_id} ───────────────────────────────────────────
@router.delete("/{doc_id}") @router.delete("/{doc_id}")
+31 -3
View File
@@ -30,13 +30,17 @@ async def _run(document_id: str) -> dict:
Opens its own AsyncSession (not shared with the upload request) to avoid Opens its own AsyncSession (not shared with the upload request) to avoid
cross-thread session contamination. cross-thread session contamination.
Cloud-aware: when doc.storage_backend != 'minio', uses
get_storage_backend_for_document() to retrieve bytes from the correct
cloud backend instead of hardcoding MinIO.
""" """
import uuid as _uuid import uuid as _uuid
from db.session import AsyncSessionLocal from db.session import AsyncSessionLocal
from db.models import Document from db.models import Document
from services import extractor, classifier from services import extractor, classifier
from storage import get_storage_backend from storage import get_storage_backend, get_storage_backend_for_document
async with AsyncSessionLocal() as session: async with AsyncSessionLocal() as session:
# ── Step 1: fetch Document row ───────────────────────────────────────── # ── Step 1: fetch Document row ─────────────────────────────────────────
@@ -59,15 +63,39 @@ async def _run(document_id: str) -> dict:
ai_provider = (user.ai_provider if user else None) or app_settings.default_ai_provider ai_provider = (user.ai_provider if user else None) or app_settings.default_ai_provider
ai_model = (user.ai_model if user else None) or app_settings.default_ai_model ai_model = (user.ai_model if user else None) or app_settings.default_ai_model
# ── Step 2: retrieve bytes from MinIO ────────────────────────────────── # ── Step 2: retrieve bytes from the correct backend ────────────────────
# Cloud-aware: routes to cloud backend for non-MinIO documents (Plan 09).
# T-05-09-03: cloud credentials are loaded from DB inside this task's own
# session — no credentials travel through the Celery broker message.
try: try:
if doc.storage_backend is None or doc.storage_backend == "minio":
backend = get_storage_backend() backend = get_storage_backend()
file_bytes = await backend.get_object(doc.object_key) file_bytes = await backend.get_object(doc.object_key)
else:
# Cloud path: user must be present (doc.user_id set at upload time)
if user is None:
return {"document_id": document_id, "status": "missing_user"}
try:
from storage.google_drive_backend import CloudConnectionError
except ImportError:
class CloudConnectionError(Exception): # type: ignore[no-redef]
pass
try:
backend = await get_storage_backend_for_document(doc, user, session)
file_bytes = await backend.get_object(doc.object_key)
except CloudConnectionError:
return {
"document_id": document_id,
"status": "extract_failed",
"error": "cloud backend error",
}
except Exception as e: except Exception as e:
return { return {
"document_id": document_id, "document_id": document_id,
"status": "extract_failed", "status": "extract_failed",
"error": f"MinIO retrieval failed: {e}", "error": f"retrieval failed: {e}",
} }
# ── Step 3: extract text from bytes ──────────────────────────────────── # ── Step 3: extract text from bytes ────────────────────────────────────
+50 -24
View File
@@ -638,50 +638,76 @@ async def test_patch_document_wrong_owner(async_client, db_session):
assert resp.status_code == 404 assert resp.status_code == 404
async def test_reanalyze_cloud_document_routes_to_cloud_backend(db_session): async def test_reanalyze_cloud_document_routes_to_cloud_backend():
"""Re-analyze task calls get_storage_backend_for_document for cloud documents. """Re-analyze task calls get_storage_backend_for_document for cloud documents.
Verifies that doc.storage_backend != 'minio' causes _run() to use the cloud Verifies that doc.storage_backend != 'minio' causes _run() to use the cloud
backend path instead of the MinIO path (Plan 09, requirement CLOUD-07). backend path instead of the MinIO path (Plan 09, requirement CLOUD-07).
Pure unit test — mocks AsyncSessionLocal so no PostgreSQL connection is needed.
""" """
from db.models import Document
from tasks.document_tasks import _run from tasks.document_tasks import _run
from unittest.mock import AsyncMock, patch, MagicMock from unittest.mock import AsyncMock, patch, MagicMock
auth = await _create_user_and_token(db_session, role="user")
# Create a nextcloud document
doc_id = _uuid.uuid4() doc_id = _uuid.uuid4()
doc = Document( user_id = _uuid.uuid4()
id=doc_id,
user_id=auth["user"].id,
filename="cloud.pdf",
content_type="application/pdf",
size_bytes=2048,
storage_backend="nextcloud",
status="uploaded",
object_key="nc_file_id_xyz",
)
db_session.add(doc)
await db_session.commit()
# Mock cloud backend: returns file bytes, enabling extraction to proceed # Build a minimal mock Document and User (no DB)
mock_doc = MagicMock()
mock_doc.id = doc_id
mock_doc.user_id = user_id
mock_doc.storage_backend = "nextcloud"
mock_doc.object_key = "nc_file_id_xyz"
mock_doc.content_type = "application/pdf"
mock_doc.filename = "cloud.pdf"
mock_doc.status = "uploaded"
mock_user = MagicMock()
mock_user.id = user_id
mock_user.ai_provider = None
mock_user.ai_model = None
# Mock cloud backend: returns fake bytes so extraction can proceed
mock_cloud_backend = AsyncMock() mock_cloud_backend = AsyncMock()
mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake content") mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake")
# Mock MinIO backend to verify it is NOT called # Mock MinIO backend to verify it is NOT called
mock_minio_backend = AsyncMock() mock_minio_backend = AsyncMock()
mock_minio_backend.get_object = AsyncMock(return_value=b"should not be called") mock_minio_backend.get_object = AsyncMock(return_value=b"should not be called")
with patch("tasks.document_tasks.get_storage_backend_for_document", return_value=mock_cloud_backend) as mock_gsb_doc, \ # Mock the DB session returned by AsyncSessionLocal
patch("tasks.document_tasks.get_storage_backend", return_value=mock_minio_backend) as mock_gsb: mock_session = AsyncMock()
async def _fake_get(model, pk):
if model.__name__ == "Document":
return mock_doc
if model.__name__ == "User":
return mock_user
return None
mock_session.get = _fake_get
# AsyncSessionLocal is an async context manager; mock it
class _FakeSessionCM:
async def __aenter__(self):
return mock_session
async def __aexit__(self, *args):
pass
# Patch at the storage module level (source of the functions used via deferred import)
with patch("db.session.AsyncSessionLocal", return_value=_FakeSessionCM()), \
patch("storage.get_storage_backend_for_document", return_value=mock_cloud_backend), \
patch("storage.get_storage_backend", return_value=mock_minio_backend), \
patch("services.extractor.extract_text_from_bytes", return_value="extracted text"), \
patch("services.classifier.classify_document", return_value=["doc"]):
result = await _run(str(doc_id)) result = await _run(str(doc_id))
# Cloud backend's get_object must have been called # Cloud backend's get_object must have been called with the document's object_key
mock_cloud_backend.get_object.assert_called_once_with("nc_file_id_xyz") mock_cloud_backend.get_object.assert_called_once_with("nc_file_id_xyz")
# MinIO backend's get_object must NOT have been called # MinIO backend's get_object must NOT have been called
mock_minio_backend.get_object.assert_not_called() mock_minio_backend.get_object.assert_not_called()
# Result must not be an error from MinIO path # Result must reflect successful classification, not a MinIO error
assert result.get("status") != "extract_failed" or "MinIO" not in result.get("error", "") assert result.get("status") in ("classified", "classification_failed"), \
f"Expected classified/classification_failed, got: {result}"