Files
kite/backend/api/documents.py
T
curo1305 a548266461 refactor(backend): extract shared helper modules per architecture rules
- Add backend/ai/utils.py — parse_classification, parse_suggestions, strip_code_fences
  shared by all AI providers; removes duplicated private functions from
  anthropic_provider.py and openai_provider.py
- Add backend/deps/utils.py — get_client_ip, parse_uuid request-parsing helpers;
  removes local _ip() variants from admin.py, auth.py, shares.py, folders.py
- Add backend/storage/exceptions.py — canonical CloudConnectionError definition;
  all routers and backends import from here instead of redefining
- Move validate_password_strength to backend/services/auth.py; removes duplicated
  _validate_password_strength from admin.py and auth.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 16:10:35 +02:00

832 lines
32 KiB
Python

"""
Document API endpoints for DocuVault — Phase 3 Wave 2 / Phase 5 Plan 06.
Implements the presigned PUT upload flow (D-04, D-05):
POST /api/documents/upload-url — create pending Document row, return presigned URL
POST /api/documents/{id}/confirm — stat MinIO for authoritative size, atomic quota UPDATE
Cloud upload path (D-10, D-14, D-15 — Phase 5 Plan 06):
POST /api/documents/upload — multipart upload with target_backend parameter;
cloud backends bypass presigned URL and use direct put_object()
Preserved endpoints (auth guards added in Plan 03-03):
GET /api/documents — list documents
GET /api/documents/{id} — get document metadata
DELETE /api/documents/{id} — delete document (decrements quota atomically)
POST /api/documents/{id}/classify — reclassify document topics
GET /api/documents/{id}/content — stream document bytes (all backends, Phase 5 Plan 06)
NOTE: The Wave 2 placeholders are obsolete — every handler in this module now depends
on get_regular_user (Plan 03-03), and /confirm asserts doc.user_id == current_user.id.
"""
from __future__ import annotations
import urllib.parse
import uuid
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request, UploadFile, File, status
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field, field_validator
from sqlalchemy import select, text, func
from sqlalchemy.ext.asyncio import AsyncSession
from config import settings
from db.models import CloudConnection, Document, Folder, Quota, Share, User
from deps.auth import get_regular_user
from deps.db import get_db
from services import classifier, storage
from services.audit import write_audit_log
from storage import get_storage_backend, get_storage_backend_for_document
from storage.cloud_utils import decrypt_credentials
from tasks.document_tasks import extract_and_classify
try:
from minio.error import S3Error
except ImportError:
# Fallback for test environments where minio is not installed
S3Error = Exception # type: ignore[assignment,misc]
from storage.exceptions import CloudConnectionError
# Valid cloud backend slugs (T-05-06-01: validated against allowlist, not user-supplied string).
# POST /api/documents/upload rejects any non-"minio" target_backend that is not in this set.
_CLOUD_PROVIDERS = frozenset({"google_drive", "onedrive", "nextcloud", "webdav"})
# Every route declared in this module is served under the /api/documents prefix.
router = APIRouter(prefix="/api/documents", tags=["documents"])
# ── Request models ────────────────────────────────────────────────────────────
class UploadUrlRequest(BaseModel):
    """Request body for POST /api/documents/upload-url."""

    # Client-supplied original filename: stored on the Document row; only its
    # suffix is used when the handler builds the storage object key.
    filename: str
    # Client-declared MIME type, stored on the Document row as content_type.
    content_type: str
class DocumentPatch(BaseModel):
    """Request body for PATCH /api/documents/{doc_id}.

    Both fields are optional; model_fields_set tells "not provided" apart from
    "explicitly set to null". The handler requires at least one field to be
    present in model_fields_set.

    T-05-09-01: explicit field declaration prevents mass assignment.
    T-05-09-02: only filename and folder_id are accepted — no other fields can be set.
    """

    filename: Optional[str] = Field(None, min_length=1, max_length=255)
    folder_id: Optional[uuid.UUID] = None

    @field_validator("filename")
    @classmethod
    def filename_no_path_separators(cls, v: Optional[str]) -> Optional[str]:
        """Reject filenames containing a forward slash or backslash; None passes through."""
        if v is None:
            return v
        if any(separator in v for separator in ("/", "\\")):
            raise ValueError("filename must not contain path separators")
        return v
# ── POST /api/documents/upload-url ───────────────────────────────────────────
@router.post("/upload-url")
async def request_upload_url(
    body: UploadUrlRequest,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Register a pending document and hand back a presigned PUT URL.

    D-05 step 1: a Document row with status='pending' and size_bytes=0 is
    committed, then a presigned PUT URL valid for 15 minutes is generated for
    its object key. The response is {upload_url, document_id}.

    Quota is NOT touched here — enforcement happens at /confirm.

    T-03-04: the object key is built server-side; the client filename is only
    stored on the row and contributes nothing but its lower-cased suffix.
    T-03-15: the key prefix is always the authenticated user's id — never user-supplied.
    """
    document_id = uuid.uuid4()
    extension = Path(body.filename).suffix.lower()
    # Key layout: <user id>/<document id>/<random uuid><extension>
    key = f"{current_user.id}/{document_id}/{uuid.uuid4()}{extension}"

    pending = Document(
        id=document_id,
        user_id=current_user.id,
        filename=body.filename,
        content_type=body.content_type,
        size_bytes=0,
        storage_backend="minio",
        status="pending",
        object_key=key,
    )
    session.add(pending)
    await session.commit()

    backend = get_storage_backend()
    presigned = await backend.generate_presigned_put_url(key, expires_minutes=15)
    return {"upload_url": presigned, "document_id": str(document_id)}
# ── POST /api/documents/upload ────────────────────────────────────────────────
@router.post("/upload")
async def upload_document(
    file: UploadFile = File(...),
    target_backend: str = Form("minio"),
    cloud_folder_path: str = Form(None),
    request: Request = None,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Multipart upload endpoint that can target MinIO or a cloud backend (D-10, D-14, D-15).

    target_backend == "minio":
        The uploaded bytes are not read. A pending Document row is committed and
        the response is {upload_url, document_id} with a 15-minute presigned PUT URL.

    target_backend in _CLOUD_PROVIDERS:
        1. Look up the user's ACTIVE CloudConnection for that provider (404 if none).
        2. Decrypt the stored credentials and read the file bytes.
        3. Instantiate the matching backend class and call put_object().
        4. Invalidate the provider folder cache when cloud_folder_path was given.
        5. Store a Document row (status='uploaded'), write an audit log entry,
           commit, and enqueue extract_and_classify.
        6. Respond with {document_id, storage_backend}.

    Any other target_backend value is rejected with HTTP 422.

    No quota UPDATE is issued on the cloud path (D-11).

    Security:
    T-05-06-01: target_backend is validated against the _CLOUD_PROVIDERS allowlist.
    T-05-06-02: the 503 returned on CloudConnectionError carries a fixed message only.
    """
    if target_backend == "minio":
        # MinIO: same presigned-PUT flow as /upload-url.
        new_id = uuid.uuid4()
        ext = Path(file.filename or "file").suffix.lower()
        minio_key = f"{current_user.id}/{new_id}/{uuid.uuid4()}{ext}"
        pending = Document(
            id=new_id,
            user_id=current_user.id,
            filename=file.filename or "upload",
            content_type=file.content_type or "application/octet-stream",
            size_bytes=0,
            storage_backend="minio",
            status="pending",
            object_key=minio_key,
        )
        session.add(pending)
        await session.commit()
        presigned = await get_storage_backend().generate_presigned_put_url(
            minio_key, expires_minutes=15
        )
        return {"upload_url": presigned, "document_id": str(new_id)}

    # Everything below is the cloud path.
    if target_backend not in _CLOUD_PROVIDERS:
        raise HTTPException(
            status_code=422,
            detail=f"Invalid target_backend '{target_backend}'. Valid values: minio, {', '.join(sorted(_CLOUD_PROVIDERS))}",
        )

    # User-scoped lookup of the active connection for this provider (T-05-06-01).
    connection_query = select(CloudConnection).where(
        CloudConnection.user_id == current_user.id,
        CloudConnection.provider == target_backend,
        CloudConnection.status == "ACTIVE",
    )
    connection = (await session.execute(connection_query)).scalar_one_or_none()
    if connection is None:
        raise HTTPException(
            status_code=404,
            detail=f"No active {target_backend} connection found. Please connect in Settings.",
        )

    # Per-user credential decryption.
    creds = decrypt_credentials(
        settings.cloud_creds_key.encode(),
        str(current_user.id),
        connection.credentials_enc,
    )

    payload = await file.read()
    upload_name = file.filename or "upload"
    mime_type = file.content_type or "application/octet-stream"
    ext = Path(upload_name).suffix.lower()
    new_id = uuid.uuid4()

    # Backend classes are imported lazily, only for the provider actually used.
    if target_backend == "google_drive":
        from storage.google_drive_backend import GoogleDriveBackend

        provider_backend = GoogleDriveBackend(creds)
    elif target_backend == "onedrive":
        from storage.onedrive_backend import OneDriveBackend

        provider_backend = OneDriveBackend(creds)
    elif target_backend == "nextcloud":
        from storage.nextcloud_backend import NextcloudBackend

        provider_backend = NextcloudBackend(
            creds["server_url"],
            creds["username"],
            creds["password"],
        )
    elif target_backend == "webdav":
        from storage.webdav_backend import WebDAVBackend

        provider_backend = WebDAVBackend(
            creds["server_url"],
            creds["username"],
            creds["password"],
        )

    try:
        stored_key = await provider_backend.put_object(
            str(current_user.id),
            str(new_id),
            payload,
            ext,
            mime_type,
            cloud_folder=cloud_folder_path or None,
            original_filename=upload_name if cloud_folder_path else None,
        )
    except CloudConnectionError as exc:
        raise HTTPException(
            status_code=503,
            detail="Cloud connection requires re-authentication. Please reconnect in Settings.",
        ) from exc

    # Drop the cached folder listing so the next GET /folders shows the new file.
    if cloud_folder_path:
        from services.cloud_cache import invalidate_provider_cache

        invalidate_provider_cache(str(current_user.id), target_backend)

    payload_size = len(payload)
    uploaded = Document(
        id=new_id,
        user_id=current_user.id,
        filename=upload_name,
        content_type=mime_type,
        size_bytes=payload_size,
        storage_backend=target_backend,
        status="uploaded",
        object_key=stored_key,
    )
    session.add(uploaded)

    # X-Forwarded-For wins over the socket peer address; None when no request object.
    client_ip = None
    if request:
        client_ip = request.headers.get("X-Forwarded-For")
        if not client_ip:
            client_ip = request.client.host if request.client else None

    await write_audit_log(
        session,
        event_type="document.uploaded",
        user_id=current_user.id,
        actor_id=current_user.id,
        resource_id=uploaded.id,
        ip_address=client_ip,
        metadata_={"size_bytes": payload_size, "storage_backend": target_backend},
    )
    await session.commit()

    extract_and_classify.delay(str(uploaded.id))
    return {"document_id": str(uploaded.id), "storage_backend": target_backend}
# ── POST /api/documents/{doc_id}/confirm ─────────────────────────────────────
@router.post("/{doc_id}/confirm")
async def confirm_upload(
    doc_id: str,
    request: Request,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Confirm a presigned PUT upload: stat storage for size, enforce quota atomically.

    D-05 step 3: the authoritative file size is read via stat_object (never from
    the client), an atomic quota UPDATE is run, the row is set to
    status='uploaded', an audit log entry is written and the Celery task enqueued.

    Responses:
        200 — {"id", "size_bytes", "used_bytes", "status": "uploaded"}
        404 — malformed id, unknown document, or document owned by someone else (D-16)
        409 — the document is not in status 'pending' (already confirmed)
        413 — quota exceeded; detail is {"used_bytes", "limit_bytes", "rejected_bytes"}
              and the pending row plus its object are removed
        422 — object missing in storage (presigned URL may have expired)
        502 — any other storage failure (detail is deliberately generic)

    T-03-05: size always comes from backend.stat_object(doc.object_key) — never client.
    T-03-06: atomic SQL UPDATE prevents concurrent over-quota uploads (STORE-03 SC2).
    T-03-11: ownership assertion — cross-user access returns 404 (D-16).
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="Document not found")
    doc = await session.get(Document, uid)
    if doc is None or doc.user_id != current_user.id:
        raise HTTPException(status_code=404, detail="Document not found")

    # Only a pending upload may be confirmed. Without this guard a repeated
    # confirm would add the size to the quota a second time, and — if that
    # pushed the user over the limit — delete an already-uploaded document.
    if doc.status != "pending":
        raise HTTPException(status_code=409, detail="Upload already confirmed")

    # Get authoritative file size from storage (T-03-05 — never trust client-supplied size)
    try:
        size = await get_storage_backend().stat_object(doc.object_key)
    except Exception as exc:
        code = getattr(exc, "code", "")
        if code == "NoSuchKey":
            raise HTTPException(
                status_code=422,
                detail="Upload not found — presigned URL may have expired",
            ) from exc
        # Do not echo the backend exception text to the client; it stays
        # available to server-side handlers through the chained __cause__.
        raise HTTPException(status_code=502, detail="Storage error") from exc

    doc.size_bytes = size
    await session.flush()

    # Atomic quota enforcement: the UPDATE only matches when the new total fits,
    # so "no row returned" means the quota would be exceeded.
    result = await session.execute(
        text(
            "UPDATE quotas "
            "SET used_bytes = used_bytes + :delta "
            "WHERE user_id = :uid "
            " AND (used_bytes + :delta) <= limit_bytes "
            "RETURNING used_bytes, limit_bytes"
        ),
        {"delta": size, "uid": doc.user_id.hex},
    )
    row = result.fetchone()
    if row is None:
        # Quota exceeded — fetch current quota state for the 413 body
        quota_result = await session.execute(
            text("SELECT used_bytes, limit_bytes FROM quotas WHERE user_id = :uid"),
            {"uid": doc.user_id.hex},
        )
        q = quota_result.fetchone()
        # Delete the pending Document row and best-effort remove the stored object
        await session.delete(doc)
        try:
            await get_storage_backend().delete_object(doc.object_key)
        except Exception:
            pass  # cleanup is deliberately best-effort; the 413 must still be returned
        await session.commit()
        raise HTTPException(
            status_code=413,
            detail={
                "used_bytes": q.used_bytes if q else 0,
                "limit_bytes": q.limit_bytes if q else 0,
                "rejected_bytes": size,
            },
        )

    used_bytes = row.used_bytes
    doc.status = "uploaded"

    # D-13: document uploaded event — size_bytes + storage_backend only, NO filename,
    # NO extracted_text (T-04-07-02).
    # TRUST BOUNDARY: X-Forwarded-For is client-controlled — for audit logging only,
    # not for auth/access control. Use a trusted reverse proxy in production to
    # overwrite this header with the real remote IP before it reaches FastAPI.
    _ip = request.headers.get("X-Forwarded-For") or (request.client.host if request.client else None)
    await write_audit_log(
        session,
        event_type="document.uploaded",
        user_id=current_user.id,
        actor_id=current_user.id,
        resource_id=doc.id,
        ip_address=_ip,
        metadata_={"size_bytes": size, "storage_backend": "minio"},
    )
    await session.commit()

    extract_and_classify.delay(str(doc.id))
    return {
        "id": str(doc.id),
        "size_bytes": size,
        "used_bytes": used_bytes,
        "status": "uploaded",
    }
# ── GET /api/documents ────────────────────────────────────────────────────────
@router.get("")
async def list_documents(
    topic: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
    sort: str = Query("date"),
    order: str = Query("desc"),
    folder_id: Optional[str] = Query(None),
    q: Optional[str] = Query(None),
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """List the current user's documents with optional sort, folder filter, and search.

    D-16: requires an authenticated regular user; only documents whose user_id
    matches the current user are returned.

    FOLD-05 parameters:
        sort      — "name" | "size"; any other value sorts by created_at
        order     — "asc"; any other value sorts descending
        folder_id — UUID string; a malformed value yields 404 "Folder not found"
        q         — full-text search via plainto_tsquery, applied only when at
                    least 2 characters long. If the FTS statement fails (e.g. a
                    dialect without @@ support) the unfiltered statement is run.
                    The search is always scoped to current_user.id (T-04-03-02).

    Backward-compat: when folder_id and q are absent and sort/order are the
    defaults, the legacy storage.list_metadata path is used.

    Returns {"items", "total", "page", "per_page"}; each item carries an
    "is_shared" flag. Only the requested page is serialized, so topic names are
    loaded for at most per_page documents.
    """
    # If no new params used, fall through to the legacy storage.list_metadata path
    # to preserve full backward compatibility with topic filtering.
    if folder_id is None and q is None and sort == "date" and order == "desc":
        docs = await storage.list_metadata(session, user_id=current_user.id, topic=topic)
        total = len(docs)
        start = (page - 1) * per_page
        # Add is_shared field (Phase 4 addition)
        shared_result = await session.execute(
            select(Share.document_id).where(Share.owner_id == current_user.id)
        )
        shared_ids = {row[0] for row in shared_result.fetchall()}
        items = []
        for d in docs[start : start + per_page]:
            doc_id_str = d.get("id", "")
            try:
                doc_uuid = uuid.UUID(doc_id_str)
            except (ValueError, AttributeError):
                doc_uuid = None
            d["is_shared"] = doc_uuid in shared_ids if doc_uuid else False
            items.append(d)
        return {"items": items, "total": total, "page": page, "per_page": per_page}

    # New path: direct ORM query with sort/filter/FTS
    from db.models import DocumentTopic, Topic  # noqa: PLC0415 (avoid circular at module top)
    from services.storage import _doc_to_dict, _load_topic_names  # noqa: PLC0415

    stmt = select(Document).where(Document.user_id == current_user.id)

    # Topic filter (join-based, same as list_metadata)
    if topic is not None:
        stmt = (
            stmt.join(DocumentTopic, DocumentTopic.document_id == Document.id)
            .join(Topic, Topic.id == DocumentTopic.topic_id)
            .where(Topic.name == topic)
        )

    # Folder filter
    if folder_id is not None:
        try:
            folder_uuid = uuid.UUID(folder_id)
        except ValueError:
            raise HTTPException(status_code=404, detail="Folder not found")
        stmt = stmt.where(Document.folder_id == folder_uuid)

    # Sort
    sort_col = Document.created_at  # default: date
    if sort == "name":
        sort_col = Document.filename
    elif sort == "size":
        sort_col = Document.size_bytes
    order_fn = sort_col.asc if order == "asc" else sort_col.desc
    stmt = stmt.order_by(order_fn())

    # Full-text search — plainto_tsquery on extracted_text (PostgreSQL only).
    # Falls back to unfiltered if the DB dialect doesn't support @@ (e.g. SQLite in test env).
    fts_requested = q is not None and len(q) >= 2
    if fts_requested:
        fts_stmt = stmt.where(
            func.to_tsvector("english", func.coalesce(Document.extracted_text, "")).op("@@")(
                func.plainto_tsquery("english", q)
            )
        )
        try:
            result = await session.execute(fts_stmt)
        except Exception:
            result = await session.execute(stmt)
    else:
        result = await session.execute(stmt)
    docs_orm = result.scalars().all()

    # Slice BEFORE serializing: the total comes from the full result, but the
    # per-document topic lookup only runs for rows that end up in the response.
    total = len(docs_orm)
    start = (page - 1) * per_page
    page_docs = docs_orm[start : start + per_page]

    # Ids of documents the current user has shared, for the is_shared flag.
    shared_result = await session.execute(
        select(Share.document_id).where(Share.owner_id == current_user.id)
    )
    shared_ids = {row[0] for row in shared_result.fetchall()}

    # Serialize the requested page
    items = []
    for doc in page_docs:
        topic_names = await _load_topic_names(session, doc.id)
        d = _doc_to_dict(doc, topic_names)
        d["is_shared"] = doc.id in shared_ids
        items.append(d)

    return {
        "items": items,
        "total": total,
        "page": page,
        "per_page": per_page,
    }
# ── GET /api/documents/{doc_id} ───────────────────────────────────────────────
@router.get("/{doc_id}")
async def get_document(
    doc_id: str,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Return the metadata of one document.

    D-16: requires an authenticated regular user. The caller must either own the
    document or be the recipient of a Share row for it; every other case —
    including a malformed id — answers 404 rather than 403 so that document
    ids are not leaked (T-03-11).

    T-04-04-03: share recipients receive the metadata without "extracted_text".
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(404, "Document not found")

    doc = await session.get(Document, uid)
    if doc is None:
        raise HTTPException(404, "Document not found")

    viewer_is_owner = doc.user_id == current_user.id
    if not viewer_is_owner:
        # Non-owners are only let through when a share addressed to them exists.
        share_query = select(Share).where(
            Share.document_id == uid,
            Share.recipient_id == current_user.id,
        )
        matching_share = (await session.execute(share_query)).scalar_one_or_none()
        if matching_share is None:
            raise HTTPException(404, "Document not found")

    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")

    if not viewer_is_owner:
        # Recipients get metadata only (consistent with /shares/received).
        meta.pop("extracted_text", None)
    return meta
# ── PATCH /api/documents/{doc_id} ────────────────────────────────────────────
@router.patch("/{doc_id}")
async def patch_document(
    doc_id: str,
    body: DocumentPatch,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Change a document's filename and/or folder.

    T-05-09-01: the get_regular_user dependency gates access; a caller who does
    not own the document gets 404 so document ids are not leaked (D-16).
    T-05-09-02: the response is whatever storage.get_metadata() returns.

    Rules:
        - A body with no fields set returns 422.
        - filename is applied only when provided and not null.
        - folder_id=null moves the document to the root (no folder).
        - folder_id=<uuid> must reference a folder owned by the caller, else 404.
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(404, "Document not found")

    doc = await session.get(Document, uid)
    if doc is None or doc.user_id != current_user.id:
        raise HTTPException(404, "Document not found")

    # model_fields_set lists the fields the client actually sent.
    provided = body.model_fields_set
    if not provided:
        raise HTTPException(422, "At least one field (filename, folder_id) must be provided")

    if body.filename is not None and "filename" in provided:
        doc.filename = body.filename

    if "folder_id" in provided:
        destination_id = body.folder_id
        if destination_id is not None:
            # Moving into a folder: it must exist and belong to the caller.
            destination = await session.get(Folder, destination_id)
            if destination is None or destination.user_id != current_user.id:
                raise HTTPException(404, "Folder not found")
        doc.folder_id = destination_id

    await session.commit()

    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")
    return meta
# ── DELETE /api/documents/{doc_id} ───────────────────────────────────────────
@router.delete("/{doc_id}")
async def delete_document(
    doc_id: str,
    request: Request,
    remove_only: bool = Query(default=False),
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Delete a document and write the audit entry in the same transaction.

    For cloud-stored documents (storage_backend != "minio"):
    - Default path: attempt the cloud provider delete first; on failure return
      {success: false, cloud_delete_failed: true} (HTTP 200) so the frontend
      can offer a "Remove from app" fallback (T-06.2-03-02). The provider error
      is logged server-side and never included in the response.
    - remove_only=true: skip the cloud delete and remove the DB row only.
    - Cloud docs are always deleted with skip_quota=True (T-06.2-03-01).

    D-16: requires authenticated regular user. Asserts ownership — cross-user
    delete returns 404 (not 403) to avoid information leakage (T-03-11).

    Returns {"success": True} once the delete and audit log are committed.
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(404, "Document not found")
    doc = await session.get(Document, uid)
    if doc is None or doc.user_id != current_user.id:
        raise HTTPException(404, "Document not found")

    is_cloud = doc.storage_backend != "minio"
    # Captured up front: these values are still needed for the audit entry
    # after the row has been deleted.
    _doc_size = doc.size_bytes
    _doc_id = doc.id
    # TRUST BOUNDARY: X-Forwarded-For is client-controlled — for audit logging only,
    # not for auth/access control. Use a trusted reverse proxy in production to
    # overwrite this header with the real remote IP before it reaches FastAPI.
    _ip = request.headers.get("X-Forwarded-For") or (request.client.host if request.client else None)

    # Cloud routing: attempt provider delete unless remove_only is set
    if is_cloud and not remove_only:
        try:
            cloud_backend = await get_storage_backend_for_document(doc, current_user, session)
            await cloud_backend.delete_object(doc.object_key)
        except Exception as exc:
            # Deliberate catch-all: any provider failure becomes the soft-failure
            # response below. Log through logging (with traceback) rather than print().
            import logging

            logging.getLogger(__name__).warning(
                "[cloud-delete] provider error: %s", exc, exc_info=True
            )
            return JSONResponse(
                status_code=200,
                content={
                    "success": False,
                    "cloud_delete_failed": True,
                    "detail": "Cloud provider delete failed. You can remove from app only.",
                },
            )

    # auto_commit=False defers the commit so the audit log write below happens
    # in the same transaction — avoids the split-transaction gap (WR-08).
    ok = await storage.delete_document(session, doc_id, skip_quota=is_cloud, auto_commit=False)
    if not ok:
        raise HTTPException(404, "Document not found")

    # D-13: document deleted event — written in the same transaction as the delete (WR-08).
    await write_audit_log(
        session,
        event_type="document.deleted",
        user_id=current_user.id,
        actor_id=current_user.id,
        resource_id=_doc_id,
        ip_address=_ip,
        metadata_={"size_bytes": _doc_size},
    )
    await session.commit()
    return {"success": True}
# ── POST /api/documents/{doc_id}/classify ────────────────────────────────────
@router.post("/{doc_id}/classify")
async def classify_document(
    doc_id: str,
    body: Optional[dict] = None,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Reclassify a document's topics on demand.

    The optional JSON body may carry a "topics" entry, which is forwarded to
    classifier.classify_document; with no body (or an empty one) None is passed.

    D-16: requires authenticated regular user. Asserts ownership — cross-user
    classify returns 404 (not 403) to avoid information leakage (T-03-11).

    Returns {"topics": <classifier result>}; any classifier exception is
    reported as HTTP 500.
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(404, "Document not found")
    doc = await session.get(Document, uid)
    if doc is None or doc.user_id != current_user.id:
        raise HTTPException(404, "Document not found")

    # body defaults to None rather than a shared mutable {}.
    topic_names = body.get("topics") if body else None
    try:
        topics = await classifier.classify_document(session, doc_id, topic_names)
    except Exception as e:
        # Chain the original exception so the traceback keeps the root cause.
        raise HTTPException(500, f"Classification failed: {e}") from e
    return {"topics": topics}
# ── Range header parsing helper ───────────────────────────────────────────────
def _parse_range(range_header: str, file_size: int) -> tuple[int, int]:
    """Parse a single-range 'bytes=...' Range header into inclusive (start, end) offsets.

    Supported forms:
        bytes=X-Y  — bytes X through Y
        bytes=X-   — byte X through the end of the file
        bytes=-N   — the last N bytes (suffix range); N larger than the file
                     selects the whole file

    Raises HTTP 416 for anything else: a missing '-', non-numeric positions,
    'bytes=-', a zero or negative suffix length, multiple ranges, or a range
    that is out of bounds.

    T-04-05-03: validates start <= end, start >= 0, end < file_size.
    """
    first, dash, last = range_header.replace("bytes=", "").partition("-")
    try:
        if not dash:
            raise ValueError("range spec has no '-'")
        if first == "":
            # Suffix form: the number after '-' is a length counted from the end,
            # not an end offset.
            suffix_length = int(last)
            if suffix_length <= 0:
                raise ValueError("suffix length must be positive")
            start = max(file_size - suffix_length, 0)
            end = file_size - 1
        else:
            start = int(first)
            end = int(last) if last != "" else file_size - 1
    except ValueError:
        raise HTTPException(status.HTTP_416_RANGE_NOT_SATISFIABLE) from None
    # Also rejects every range on an empty file, where end is -1.
    if start > end or start < 0 or end >= file_size:
        raise HTTPException(status.HTTP_416_RANGE_NOT_SATISFIABLE)
    return start, end
# ── GET /api/documents/{doc_id}/content ──────────────────────────────────────
@router.get("/{doc_id}/content")
async def stream_document_content(
    doc_id: str,
    request: Request,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Serve a document's bytes from whichever backend stores it (DOC-02).

    T-04-05-01: gated by get_regular_user.
    T-04-05-02: bytes are fetched via get_object() ONLY — no presigned URL is used.
    T-04-05-03: a Range header is validated by _parse_range(); invalid range → 416.
    T-04-05-04: the caller must own the document or be the recipient of a Share
    row for it; otherwise the answer is 404.

    Backend failures map to 503 (CloudConnectionError) or 502 (anything else);
    an HTTPException raised while fetching is passed through untouched.

    Returns 200, or 206 when a Range header is present, with:
        Content-Type: doc.content_type
        Content-Disposition: inline; filename*=UTF-8''<percent-encoded filename>
        Accept-Ranges: bytes
        Content-Length: <number of bytes sent>
        Content-Range: bytes <start>-<end>/<total>   (206 only)
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="Document not found")

    doc = await session.get(Document, uid)
    if doc is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # Access control: owner OR share recipient (T-04-05-04)
    if doc.user_id != current_user.id:
        share_query = select(Share).where(
            Share.document_id == doc.id,
            Share.recipient_id == current_user.id,
        )
        if (await session.execute(share_query)).scalar_one_or_none() is None:
            raise HTTPException(status_code=404, detail="Document not found")

    # get_storage_backend_for_document picks the backend for this document
    # (D-15, T-04-05-02); the whole object is read into memory.
    try:
        backend = await get_storage_backend_for_document(doc, current_user, session)
        payload = await backend.get_object(doc.object_key)
    except CloudConnectionError as exc:
        raise HTTPException(
            status_code=503,
            detail="Cloud connection requires re-authentication. Please reconnect in Settings.",
        ) from exc
    except HTTPException:
        raise
    except Exception as exc:
        raise HTTPException(
            status_code=502,
            detail="Cloud backend unreachable. Please try again or reconnect in Settings.",
        ) from exc

    total_size = len(payload)
    # Percent-encode the filename for the filename* parameter.
    encoded_name = urllib.parse.quote(doc.filename, safe="")
    response_headers = {
        "content-type": doc.content_type,
        "content-disposition": f"inline; filename*=UTF-8''{encoded_name}",
        "accept-ranges": "bytes",
        "content-length": str(total_size),
    }

    requested_range = request.headers.get("range")
    if not requested_range:
        # No (or empty) Range header: send the full body.
        return StreamingResponse(
            iter([payload]),
            status_code=200,
            headers=response_headers,
        )

    first_byte, last_byte = _parse_range(requested_range, total_size)
    partial = payload[first_byte : last_byte + 1]
    response_headers["content-range"] = f"bytes {first_byte}-{last_byte}/{total_size}"
    response_headers["content-length"] = str(len(partial))
    return StreamingResponse(
        iter([partial]),
        status_code=206,
        headers=response_headers,
    )