feat(05-09): PATCH /documents/{id} endpoint + cloud-aware Celery re-analyze
- Add DocumentPatch Pydantic model with filename and folder_id optional fields
- Add PATCH /api/documents/{doc_id} endpoint: ownership guard, model_fields_set
to distinguish absent vs null folder_id, returns updated metadata dict
- Update _run() in document_tasks.py to use get_storage_backend_for_document
for non-MinIO backends instead of hardcoded MinIO path
- CloudConnectionError caught in cloud path: returns extract_failed status
- Update test to use pure unit mocks (no PostgreSQL) for _run() cloud routing
- All 3 plan tests pass; 23 test_cloud.py tests pass
This commit is contained in:
@@ -69,6 +69,19 @@ class UploadUrlRequest(BaseModel):
|
||||
content_type: str
|
||||
|
||||
|
||||
class DocumentPatch(BaseModel):
|
||||
"""Pydantic model for PATCH /api/documents/{doc_id}.
|
||||
|
||||
Optional fields — model_fields_set distinguishes "not provided" from "set to null".
|
||||
At least one field must be present in model_fields_set (enforced in the handler).
|
||||
|
||||
T-05-09-01: explicit field declaration prevents mass assignment.
|
||||
T-05-09-02: only filename and folder_id are accepted — no other fields can be set.
|
||||
"""
|
||||
filename: Optional[str] = None
|
||||
folder_id: Optional[uuid.UUID] = None
|
||||
|
||||
|
||||
# ── POST /api/documents/upload-url ───────────────────────────────────────────
|
||||
|
||||
@router.post("/upload-url")
|
||||
@@ -520,6 +533,53 @@ async def get_document(
|
||||
return meta
|
||||
|
||||
|
||||
# ── PATCH /api/documents/{doc_id} ────────────────────────────────────────────
|
||||
|
||||
@router.patch("/{doc_id}")
|
||||
async def patch_document(
|
||||
doc_id: str,
|
||||
body: DocumentPatch,
|
||||
session: AsyncSession = Depends(get_db),
|
||||
current_user: User = Depends(get_regular_user),
|
||||
):
|
||||
"""Update document metadata (filename and/or folder_id).
|
||||
|
||||
T-05-09-01: get_regular_user dep rejects admins (403) and unauthenticated (401).
|
||||
T-05-09-01: ownership check — non-owner gets 404 to avoid leaking document IDs (D-16).
|
||||
T-05-09-02: response uses storage.get_metadata() which excludes credentials_enc and
|
||||
password_hash via the _doc_to_dict whitelist.
|
||||
|
||||
At least one field must be provided — empty body returns 422.
|
||||
folder_id=null moves the document to the root (no folder).
|
||||
"""
|
||||
try:
|
||||
uid = uuid.UUID(doc_id)
|
||||
except ValueError:
|
||||
raise HTTPException(404, "Document not found")
|
||||
|
||||
doc = await session.get(Document, uid)
|
||||
if doc is None or doc.user_id != current_user.id:
|
||||
raise HTTPException(404, "Document not found")
|
||||
|
||||
# Require at least one field to be set (model_fields_set tracks provided fields)
|
||||
if not body.model_fields_set:
|
||||
raise HTTPException(422, "At least one field (filename, folder_id) must be provided")
|
||||
|
||||
if "filename" in body.model_fields_set and body.filename is not None:
|
||||
doc.filename = body.filename
|
||||
|
||||
if "folder_id" in body.model_fields_set:
|
||||
# folder_id=null → move to root (no folder); folder_id=<uuid> → move to folder
|
||||
doc.folder_id = body.folder_id
|
||||
|
||||
await session.commit()
|
||||
|
||||
meta = await storage.get_metadata(session, doc_id)
|
||||
if meta is None:
|
||||
raise HTTPException(404, "Document not found")
|
||||
return meta
|
||||
|
||||
|
||||
# ── DELETE /api/documents/{doc_id} ───────────────────────────────────────────
|
||||
|
||||
@router.delete("/{doc_id}")
|
||||
|
||||
@@ -30,13 +30,17 @@ async def _run(document_id: str) -> dict:
|
||||
|
||||
Opens its own AsyncSession (not shared with the upload request) to avoid
|
||||
cross-thread session contamination.
|
||||
|
||||
Cloud-aware: when doc.storage_backend != 'minio', uses
|
||||
get_storage_backend_for_document() to retrieve bytes from the correct
|
||||
cloud backend instead of hardcoding MinIO.
|
||||
"""
|
||||
import uuid as _uuid
|
||||
|
||||
from db.session import AsyncSessionLocal
|
||||
from db.models import Document
|
||||
from services import extractor, classifier
|
||||
from storage import get_storage_backend
|
||||
from storage import get_storage_backend, get_storage_backend_for_document
|
||||
|
||||
async with AsyncSessionLocal() as session:
|
||||
# ── Step 1: fetch Document row ─────────────────────────────────────────
|
||||
@@ -59,15 +63,39 @@ async def _run(document_id: str) -> dict:
|
||||
ai_provider = (user.ai_provider if user else None) or app_settings.default_ai_provider
|
||||
ai_model = (user.ai_model if user else None) or app_settings.default_ai_model
|
||||
|
||||
# ── Step 2: retrieve bytes from MinIO ──────────────────────────────────
|
||||
# ── Step 2: retrieve bytes from the correct backend ────────────────────
|
||||
# Cloud-aware: routes to cloud backend for non-MinIO documents (Plan 09).
|
||||
# T-05-09-03: cloud credentials are loaded from DB inside this task's own
|
||||
# session — no credentials travel through the Celery broker message.
|
||||
try:
|
||||
if doc.storage_backend is None or doc.storage_backend == "minio":
|
||||
backend = get_storage_backend()
|
||||
file_bytes = await backend.get_object(doc.object_key)
|
||||
else:
|
||||
# Cloud path: user must be present (doc.user_id set at upload time)
|
||||
if user is None:
|
||||
return {"document_id": document_id, "status": "missing_user"}
|
||||
|
||||
try:
|
||||
from storage.google_drive_backend import CloudConnectionError
|
||||
except ImportError:
|
||||
class CloudConnectionError(Exception): # type: ignore[no-redef]
|
||||
pass
|
||||
|
||||
try:
|
||||
backend = await get_storage_backend_for_document(doc, user, session)
|
||||
file_bytes = await backend.get_object(doc.object_key)
|
||||
except CloudConnectionError:
|
||||
return {
|
||||
"document_id": document_id,
|
||||
"status": "extract_failed",
|
||||
"error": "cloud backend error",
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"document_id": document_id,
|
||||
"status": "extract_failed",
|
||||
"error": f"MinIO retrieval failed: {e}",
|
||||
"error": f"retrieval failed: {e}",
|
||||
}
|
||||
|
||||
# ── Step 3: extract text from bytes ────────────────────────────────────
|
||||
|
||||
+50
-24
@@ -638,50 +638,76 @@ async def test_patch_document_wrong_owner(async_client, db_session):
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
async def test_reanalyze_cloud_document_routes_to_cloud_backend(db_session):
|
||||
async def test_reanalyze_cloud_document_routes_to_cloud_backend():
|
||||
"""Re-analyze task calls get_storage_backend_for_document for cloud documents.
|
||||
|
||||
Verifies that doc.storage_backend != 'minio' causes _run() to use the cloud
|
||||
backend path instead of the MinIO path (Plan 09, requirement CLOUD-07).
|
||||
|
||||
Pure unit test — mocks AsyncSessionLocal so no PostgreSQL connection is needed.
|
||||
"""
|
||||
from db.models import Document
|
||||
from tasks.document_tasks import _run
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
|
||||
auth = await _create_user_and_token(db_session, role="user")
|
||||
|
||||
# Create a nextcloud document
|
||||
doc_id = _uuid.uuid4()
|
||||
doc = Document(
|
||||
id=doc_id,
|
||||
user_id=auth["user"].id,
|
||||
filename="cloud.pdf",
|
||||
content_type="application/pdf",
|
||||
size_bytes=2048,
|
||||
storage_backend="nextcloud",
|
||||
status="uploaded",
|
||||
object_key="nc_file_id_xyz",
|
||||
)
|
||||
db_session.add(doc)
|
||||
await db_session.commit()
|
||||
user_id = _uuid.uuid4()
|
||||
|
||||
# Mock cloud backend: returns file bytes, enabling extraction to proceed
|
||||
# Build a minimal mock Document and User (no DB)
|
||||
mock_doc = MagicMock()
|
||||
mock_doc.id = doc_id
|
||||
mock_doc.user_id = user_id
|
||||
mock_doc.storage_backend = "nextcloud"
|
||||
mock_doc.object_key = "nc_file_id_xyz"
|
||||
mock_doc.content_type = "application/pdf"
|
||||
mock_doc.filename = "cloud.pdf"
|
||||
mock_doc.status = "uploaded"
|
||||
|
||||
mock_user = MagicMock()
|
||||
mock_user.id = user_id
|
||||
mock_user.ai_provider = None
|
||||
mock_user.ai_model = None
|
||||
|
||||
# Mock cloud backend: returns fake bytes so extraction can proceed
|
||||
mock_cloud_backend = AsyncMock()
|
||||
mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake content")
|
||||
mock_cloud_backend.get_object = AsyncMock(return_value=b"%PDF-1.4 fake")
|
||||
|
||||
# Mock MinIO backend to verify it is NOT called
|
||||
mock_minio_backend = AsyncMock()
|
||||
mock_minio_backend.get_object = AsyncMock(return_value=b"should not be called")
|
||||
|
||||
with patch("tasks.document_tasks.get_storage_backend_for_document", return_value=mock_cloud_backend) as mock_gsb_doc, \
|
||||
patch("tasks.document_tasks.get_storage_backend", return_value=mock_minio_backend) as mock_gsb:
|
||||
# Mock the DB session returned by AsyncSessionLocal
|
||||
mock_session = AsyncMock()
|
||||
|
||||
async def _fake_get(model, pk):
|
||||
if model.__name__ == "Document":
|
||||
return mock_doc
|
||||
if model.__name__ == "User":
|
||||
return mock_user
|
||||
return None
|
||||
|
||||
mock_session.get = _fake_get
|
||||
|
||||
# AsyncSessionLocal is an async context manager; mock it
|
||||
class _FakeSessionCM:
|
||||
async def __aenter__(self):
|
||||
return mock_session
|
||||
async def __aexit__(self, *args):
|
||||
pass
|
||||
|
||||
# Patch at the storage module level (source of the functions used via deferred import)
|
||||
with patch("db.session.AsyncSessionLocal", return_value=_FakeSessionCM()), \
|
||||
patch("storage.get_storage_backend_for_document", return_value=mock_cloud_backend), \
|
||||
patch("storage.get_storage_backend", return_value=mock_minio_backend), \
|
||||
patch("services.extractor.extract_text_from_bytes", return_value="extracted text"), \
|
||||
patch("services.classifier.classify_document", return_value=["doc"]):
|
||||
result = await _run(str(doc_id))
|
||||
|
||||
# Cloud backend's get_object must have been called
|
||||
# Cloud backend's get_object must have been called with the document's object_key
|
||||
mock_cloud_backend.get_object.assert_called_once_with("nc_file_id_xyz")
|
||||
|
||||
# MinIO backend's get_object must NOT have been called
|
||||
mock_minio_backend.get_object.assert_not_called()
|
||||
|
||||
# Result must not be an error from MinIO path
|
||||
assert result.get("status") != "extract_failed" or "MinIO" not in result.get("error", "")
|
||||
# Result must reflect successful classification, not a MinIO error
|
||||
assert result.get("status") in ("classified", "classification_failed"), \
|
||||
f"Expected classified/classification_failed, got: {result}"
|
||||
|
||||
Reference in New Issue
Block a user