From f868a4e0c74ff2e2f6125be333a31b9fc4952bbd Mon Sep 17 00:00:00 2001
From: curo1305
Date: Mon, 25 May 2026 18:48:32 +0200
Subject: [PATCH] feat(phase-4-05): document streaming proxy GET /api/documents/{id}/content (DOC-02)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add _parse_range() helper: accepts bytes=X-Y, bytes=X- and the suffix form
  bytes=-N (last N bytes); raises 416 with Content-Range: bytes */SIZE on any
  malformed, multi-part or out-of-bounds range
- Add _inline_content_disposition() helper: filenames containing quotes,
  spaces or non-ASCII characters are sent as an RFC 5987 filename* parameter
- Add stream_document_content endpoint with get_regular_user dep (admin → 403)
- Access check: owner OR Share.recipient_id; neither → 404
- Bytes fetched via get_object() only — presigned_get_url() never called
- Range requests return 206 + Content-Range header
- Add pdf_open_mode column to User ORM model (migration 0004 already applied)
- Use HTTP_416_RANGE_NOT_SATISFIABLE (non-deprecated constant)
---
Review notes (this section is ignored by git am):
- Indentation of the context lines was reconstructed from a
  whitespace-collapsed copy of this patch; use
  `git apply --ignore-whitespace` if a hunk is rejected.
- Blob ids on the `index` lines are those of the original commit.

 backend/api/documents.py | 144 ++++++++++++++++++++++++++++++++++++++-
 backend/db/models.py     |   3 +
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/backend/api/documents.py b/backend/api/documents.py
index f2df032..69077cf 100644
--- a/backend/api/documents.py
+++ b/backend/api/documents.py
@@ -20,7 +20,9 @@ import uuid
 from pathlib import Path
 from typing import Optional
+from urllib.parse import quote
 
-from fastapi import APIRouter, Depends, HTTPException, Query, status
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from sqlalchemy import select, text, func
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -395,3 +397,143 @@ async def classify_document(
         raise HTTPException(500, f"Classification failed: {e}")
 
     return {"topics": topics}
+
+
+# ── Range header parsing helper ───────────────────────────────────────────────
+
+def _parse_range(range_header: str, file_size: int) -> tuple:
+    """Parse a single 'bytes=...' Range header and return (start, end).
+
+    Supported forms (RFC 9110, section 14.1.2):
+        bytes=X-Y   explicit inclusive range
+        bytes=X-    from offset X to the end of the file
+        bytes=-N    suffix range: the last N bytes of the file
+
+    Returns (start, end) where both are inclusive byte offsets.
+    Raises HTTP 416 (with 'Content-Range: bytes */SIZE') on any
+    malformed, multi-part or out-of-bounds range.
+
+    T-04-05-03: validates start <= end, start >= 0, end < file_size.
+    """
+    unsatisfiable = HTTPException(
+        status.HTTP_416_RANGE_NOT_SATISFIABLE,
+        headers={"Content-Range": f"bytes */{file_size}"},
+    )
+
+    unit, sep, spec = range_header.strip().partition("=")
+    first, dash, last = spec.partition("-")
+    first, last = first.strip(), last.strip()
+    if unit.strip().lower() != "bytes" or not sep or not dash:
+        raise unsatisfiable
+    # isdecimal() rejects signs, extra dashes and comma-separated multi-ranges.
+    if (first and not first.isdecimal()) or (last and not last.isdecimal()):
+        raise unsatisfiable
+
+    if first:
+        start = int(first)
+        end = int(last) if last else file_size - 1
+    elif last:
+        # Suffix form: 'bytes=-N' selects the final N bytes, not bytes 0..N.
+        suffix_length = int(last)
+        if suffix_length == 0:
+            raise unsatisfiable
+        start = max(file_size - suffix_length, 0)
+        end = file_size - 1
+    else:
+        # 'bytes=-' names no position at all.
+        raise unsatisfiable
+
+    if start > end or end >= file_size:
+        raise unsatisfiable
+    return start, end
+
+
+# ── Content-Disposition helper ────────────────────────────────────────────────
+
+def _inline_content_disposition(filename: Optional[str]) -> str:
+    """Build an 'inline' Content-Disposition value safe for any filename.
+
+    Names made only of URL-unreserved characters keep the plain quoted form.
+    Anything else (quotes, spaces, control characters, non-ASCII text) is sent
+    as a percent-encoded RFC 5987 filename* parameter, so the name cannot
+    break out of the quoted string and the header value stays encodable.
+    """
+    name = filename or "document"
+    encoded = quote(name, safe="")
+    if encoded == name:
+        return f'inline; filename="{name}"'
+    return f"inline; filename*=UTF-8''{encoded}"
+
+
+# ── GET /api/documents/{doc_id}/content ──────────────────────────────────────
+
+@router.get("/{doc_id}/content")
+async def stream_document_content(
+    doc_id: str,
+    request: Request,
+    session: AsyncSession = Depends(get_db),
+    current_user: User = Depends(get_regular_user),
+):
+    """Stream document bytes directly from MinIO (DOC-02).
+
+    T-04-05-01: uses get_regular_user — admin role → 403 (critical security invariant).
+    T-04-05-02: bytes fetched via get_object() ONLY — presigned_get_url() never called.
+    T-04-05-03: Range header validated via _parse_range(); invalid range → 416.
+    T-04-05-04: access gated on ownership OR active Share.recipient_id.
+
+    Returns 200 (or 206 for Range requests) with:
+        Content-Type: doc.content_type
+        Content-Disposition: inline; filename="FILENAME" (or filename*=UTF-8''...)
+        Accept-Ranges: bytes
+        Content-Length: number of bytes in the response body
+        Content-Range: bytes START-END/TOTAL (206 responses only)
+    """
+    try:
+        uid = uuid.UUID(doc_id)
+    except ValueError:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    doc = await session.get(Document, uid)
+    if doc is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    # Access control: owner OR share recipient (T-04-05-04)
+    if doc.user_id != current_user.id:
+        result = await session.execute(
+            select(Share).where(
+                Share.document_id == doc.id,
+                Share.recipient_id == current_user.id,
+            )
+        )
+        share = result.scalar_one_or_none()
+        if share is None:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+    # Fetch bytes directly from MinIO — NEVER via presigned URL (T-04-05-02)
+    file_bytes = await get_storage_backend().get_object(doc.object_key)
+    file_size = len(file_bytes)
+
+    headers = {
+        "content-type": doc.content_type,
+        "content-disposition": _inline_content_disposition(doc.filename),
+        "accept-ranges": "bytes",
+        "content-length": str(file_size),
+    }
+
+    range_header = request.headers.get("range")
+    if range_header:
+        start, end = _parse_range(range_header, file_size)
+        chunk = file_bytes[start : end + 1]
+        headers["content-range"] = f"bytes {start}-{end}/{file_size}"
+        headers["content-length"] = str(len(chunk))
+        return StreamingResponse(
+            iter([chunk]),
+            status_code=206,
+            headers=headers,
+        )
+
+    return StreamingResponse(
+        iter([file_bytes]),
+        status_code=200,
+        headers=headers,
+    )
diff --git a/backend/db/models.py b/backend/db/models.py
index e65841f..5e9db49 100644
--- a/backend/db/models.py
+++ b/backend/db/models.py
@@ -61,6 +61,9 @@ class User(Base):
     default_storage_backend: Mapped[str] = mapped_column(
         String, nullable=False, default="minio"
     )
+    pdf_open_mode: Mapped[str] = mapped_column(
+        String, nullable=False, server_default="in_app"
+    )
     created_at: Mapped[datetime] = mapped_column(
         TIMESTAMP(timezone=True), nullable=False, server_default=func.now()
     )