feat: migrate doc-service to use storage-service for file I/O (Phase 2)

- storage.py: replace aiofiles filesystem ops with httpx calls to
  storage-service PUT/GET/DELETE /objects/documents/{key}
- Document model: rename file_path → storage_key (plain object key, no path prefix)
- Migration 0008: ALTER COLUMN plus a data migration that strips the /data/documents/ prefix
- documents.py: update upload, delete, download endpoints; _extract_pdf_text
  now takes bytes (pdfplumber.open(BytesIO)) instead of a filesystem path
- file_watcher.py: store storage_key instead of file_path on ingestion
- doc-service config: add STORAGE_SERVICE_URL env var

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-20 15:57:29 +02:00
parent 5349f21752
commit 2f3efb9bf9
6 changed files with 128 additions and 36 deletions
+14 -13
View File
@@ -1,10 +1,10 @@
import asyncio
import io
import json
import math
import uuid
from datetime import datetime, timezone
import aiofiles
import pdfplumber
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, UploadFile
from fastapi.responses import StreamingResponse
@@ -29,7 +29,7 @@ from app.schemas.document import (
from app.schemas.share import DocumentShareCreate, DocumentShareOut, SharedDocumentOut
from app.services.ai_client import AIServiceError, classify_document
from app.services.config_reader import load_doc_config
from app.services.storage import delete_file, get_upload_path, save_upload
from app.services.storage import delete_file, download_file, save_upload
router = APIRouter()
@@ -118,10 +118,10 @@ def _doc_with_categories(
)
def _extract_pdf_text(file_path: str) -> str:
def _extract_pdf_text(pdf_bytes: bytes) -> str:
"""Synchronous — must be called via asyncio.to_thread."""
text_parts = []
with pdfplumber.open(file_path) as pdf:
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
@@ -146,7 +146,8 @@ async def process_document(doc_id: str) -> None:
await db.commit()
try:
text = await asyncio.to_thread(_extract_pdf_text, doc.file_path)
pdf_bytes = await download_file(doc.storage_key)
text = await asyncio.to_thread(_extract_pdf_text, pdf_bytes)
result = await classify_document(text)
doc.raw_text = text[:500_000] # cap stored text at 500k chars
@@ -187,13 +188,13 @@ async def upload_document(
)
doc_id = str(uuid.uuid4())
dest = await save_upload(file_data, user_id, doc_id)
storage_key = await save_upload(file_data, user_id, doc_id)
doc = Document(
id=doc_id,
user_id=user_id,
filename=file.filename or "upload.pdf",
file_path=str(dest),
storage_key=storage_key,
file_size=len(file_data),
status="pending",
)
@@ -578,7 +579,7 @@ async def delete_document(
if not can_delete_via_share and not can_delete_as_group_admin:
raise HTTPException(status_code=403, detail="Not allowed to delete this document")
delete_file(doc.file_path)
await delete_file(doc.storage_key)
await db.delete(doc)
await db.commit()
@@ -609,13 +610,13 @@ async def download_file(
if doc is None:
raise HTTPException(status_code=404, detail="Document not found")
async def file_generator():
async with aiofiles.open(doc.file_path, "rb") as f:
while chunk := await f.read(64 * 1024):
yield chunk
try:
pdf_bytes = await download_file(doc.storage_key)
except FileNotFoundError:
raise HTTPException(status_code=404, detail="File not found in storage")
return StreamingResponse(
file_generator(),
iter([pdf_bytes]),
media_type="application/pdf",
headers={"Content-Disposition": f'inline; filename="{doc.filename}"'},
)