feat(phase-4-05): document streaming proxy GET /api/documents/{id}/content (DOC-02)

- Add _parse_range() helper: validates Range header bounds, raises 416 on invalid
- Add stream_document_content endpoint with get_regular_user dep (admin → 403)
- Access check: owner OR Share.recipient_id; neither → 404
- Bytes fetched via get_object() only — presigned_get_url() never called
- Range requests return 206 + Content-Range header
- Add pdf_open_mode column to User ORM model (migration 0004 already applied)
- Use HTTP_416_RANGE_NOT_SATISFIABLE (non-deprecated constant)
This commit is contained in:
curo1305
2026-05-25 18:48:32 +02:00
parent 364447d0bc
commit f868a4e0c7
2 changed files with 99 additions and 1 deletions
+96 -1
View File
@@ -20,7 +20,8 @@ import uuid
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, status
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from sqlalchemy import select, text, func
from sqlalchemy.ext.asyncio import AsyncSession
@@ -395,3 +396,97 @@ async def classify_document(
raise HTTPException(500, f"Classification failed: {e}")
return {"topics": topics}
# ── Range header parsing helper ───────────────────────────────────────────────
def _parse_range(range_header: str, file_size: int) -> tuple:
    """Parse a single-range ``bytes=`` Range header into inclusive offsets.

    Supported forms (RFC 9110 section 14.1.2):

    * ``bytes=X-Y`` -- explicit first and last byte positions.
    * ``bytes=X-``  -- from offset X to the end of the file.
    * ``bytes=-N``  -- suffix range: the *last* N bytes of the file. If N
      exceeds the file size the whole file is selected.

    Args:
        range_header: Raw value of the request's ``Range`` header.
        file_size: Total size of the representation in bytes.

    Returns:
        ``(start, end)`` where both values are inclusive byte offsets.

    Raises:
        HTTPException: 416 when the header is malformed (non-numeric bounds,
            missing ``-``, ``bytes=-``, a zero-length suffix, multi-range
            lists) or when the resulting range is out of bounds.

    T-04-05-03: validates start <= end, start >= 0, end < file_size.
    """
    # partition() splits on the FIRST dash only, so trailing garbage such as
    # "0-1-2" lands in `last` and is rejected by int() below instead of
    # being silently ignored.
    first, dash, last = range_header.replace("bytes=", "").partition("-")
    try:
        if not dash:
            raise ValueError("range spec has no '-' separator")
        if first:
            start = int(first)
            end = int(last) if last else file_size - 1
        else:
            # Suffix form "-N": N counts bytes from the END of the file, it
            # is not an end offset. int("") raises for a bare "bytes=-".
            suffix_length = int(last)
            if suffix_length <= 0:
                raise ValueError("suffix length must be positive")
            start = max(file_size - suffix_length, 0)
            end = file_size - 1
    except ValueError:
        raise HTTPException(status.HTTP_416_RANGE_NOT_SATISFIABLE) from None

    # Also covers empty files: end == -1 makes start > end.
    if start > end or start < 0 or end >= file_size:
        raise HTTPException(status.HTTP_416_RANGE_NOT_SATISFIABLE)
    return start, end
# ── GET /api/documents/{doc_id}/content ──────────────────────────────────────
@router.get("/{doc_id}/content")
async def stream_document_content(
    doc_id: str,
    request: Request,
    session: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_regular_user),
):
    """Serve a document's bytes from the storage backend (DOC-02).

    T-04-05-01: depends on get_regular_user; per the design notes that
        dependency is expected to reject admin users with 403 (its
        implementation is not visible here).
    T-04-05-02: bytes fetched via get_object() ONLY -- presigned_get_url()
        is never called.
    T-04-05-03: Range header validated via _parse_range(); invalid range -> 416.
    T-04-05-04: access gated on ownership OR a Share row whose recipient_id
        is the current user.
        NOTE(review): the design notes say "active" share, but no
        expiry/revocation column is checked here -- confirm whether Share
        has such a state.

    Args:
        doc_id: Document id from the path; anything that is not a valid UUID
            is answered with 404 rather than a validation error.
        request: Incoming request, used only to read the ``Range`` header.
        session: Database session.
        current_user: Authenticated caller.

    Returns:
        StreamingResponse with status 200 (or 206 for Range requests) and:
            Content-Type: doc.content_type (application/octet-stream if empty)
            Content-Disposition: inline; filename="<filename>"
            Accept-Ranges: bytes
            Content-Length: <size of the returned body>
            Content-Range: bytes <start>-<end>/<total>   (206 only)

    Raises:
        HTTPException: 404 when the id is malformed, the document does not
            exist, or the caller is neither owner nor share recipient
            (the same response in all three cases); 416 for an
            unsatisfiable Range header.
    """
    try:
        uid = uuid.UUID(doc_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="Document not found")

    doc = await session.get(Document, uid)
    if doc is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # Access control: owner OR share recipient (T-04-05-04)
    if doc.user_id != current_user.id:
        result = await session.execute(
            select(Share).where(
                Share.document_id == doc.id,
                Share.recipient_id == current_user.id,
            )
        )
        share = result.scalar_one_or_none()
        if share is None:
            raise HTTPException(status_code=404, detail="Document not found")

    # Fetch bytes directly from storage -- NEVER via presigned URL (T-04-05-02).
    # NOTE(review): len() and slicing below imply get_object() returns the
    # whole object in memory, so this is not incremental streaming.
    file_bytes = await get_storage_backend().get_object(doc.object_key)
    file_size = len(file_bytes)

    headers = {
        # A missing content type must not become an invalid header value.
        "content-type": doc.content_type or "application/octet-stream",
        "content-disposition": _inline_content_disposition(doc.filename),
        "accept-ranges": "bytes",
        "content-length": str(file_size),
    }

    range_header = request.headers.get("range")
    if range_header:
        start, end = _parse_range(range_header, file_size)
        chunk = file_bytes[start : end + 1]
        headers["content-range"] = f"bytes {start}-{end}/{file_size}"
        headers["content-length"] = str(len(chunk))
        return StreamingResponse(
            iter([chunk]),
            status_code=206,
            headers=headers,
        )

    return StreamingResponse(
        iter([file_bytes]),
        status_code=200,
        headers=headers,
    )


def _inline_content_disposition(filename: str) -> str:
    """Build a header-safe ``Content-Disposition: inline`` value.

    The filename is user-controlled, so it cannot be interpolated verbatim:
    a double quote or backslash would corrupt the quoted-string, control
    characters such as CR/LF are not legal in a header value, and non-ASCII
    characters need the extended ``filename*`` syntax to be transmitted
    reliably.

    Plain printable-ASCII names yield exactly ``inline; filename="<name>"``.
    Any other name yields an ASCII fallback (offending characters replaced
    by ``_``) plus an RFC 5987 ``filename*=UTF-8''<percent-encoded>``
    parameter carrying the real name.
    """
    # Function-scope import so the block does not rely on the module header.
    from urllib.parse import quote

    # str() mirrors what f-string interpolation would have done for
    # non-string values.
    name = str(filename)
    fallback = "".join(
        ch if (" " <= ch <= "~" and ch not in '"\\') else "_" for ch in name
    )
    value = f'inline; filename="{fallback}"'
    if fallback != name:
        value += f"; filename*=UTF-8''{quote(name, safe='')}"
    return value
+3
View File
@@ -61,6 +61,9 @@ class User(Base):
default_storage_backend: Mapped[str] = mapped_column(
String, nullable=False, default="minio"
)
pdf_open_mode: Mapped[str] = mapped_column(
String, nullable=False, server_default="in_app"
)
created_at: Mapped[datetime] = mapped_column(
TIMESTAMP(timezone=True), nullable=False, server_default=func.now()
)