feat: migrate doc-service to use storage-service for file I/O (Phase 2)
- storage.py: replace aiofiles filesystem ops with httpx calls to the
storage-service endpoints PUT/GET/DELETE /objects/documents/{key}
- Document model: rename file_path → storage_key (plain object key, no path prefix)
- Migration 0008: ALTER COLUMN plus a data migration that strips the /data/documents/ prefix
- documents.py: update upload, delete, download endpoints; _extract_pdf_text
now takes bytes (pdfplumber.open(BytesIO)) instead of a filesystem path
- file_watcher.py: store storage_key instead of file_path on ingestion
- doc-service config: add STORAGE_SERVICE_URL env var
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import math
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import aiofiles
|
||||
import pdfplumber
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, UploadFile
|
||||
from fastapi.responses import StreamingResponse
|
||||
@@ -29,7 +29,7 @@ from app.schemas.document import (
|
||||
from app.schemas.share import DocumentShareCreate, DocumentShareOut, SharedDocumentOut
|
||||
from app.services.ai_client import AIServiceError, classify_document
|
||||
from app.services.config_reader import load_doc_config
|
||||
from app.services.storage import delete_file, get_upload_path, save_upload
|
||||
from app.services.storage import delete_file, download_file, save_upload
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -118,10 +118,10 @@ def _doc_with_categories(
|
||||
)
|
||||
|
||||
|
||||
def _extract_pdf_text(file_path: str) -> str:
|
||||
def _extract_pdf_text(pdf_bytes: bytes) -> str:
|
||||
"""Synchronous — must be called via asyncio.to_thread."""
|
||||
text_parts = []
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
||||
for page in pdf.pages:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
@@ -146,7 +146,8 @@ async def process_document(doc_id: str) -> None:
|
||||
await db.commit()
|
||||
|
||||
try:
|
||||
text = await asyncio.to_thread(_extract_pdf_text, doc.file_path)
|
||||
pdf_bytes = await download_file(doc.storage_key)
|
||||
text = await asyncio.to_thread(_extract_pdf_text, pdf_bytes)
|
||||
result = await classify_document(text)
|
||||
|
||||
doc.raw_text = text[:500_000] # cap stored text at 500k chars
|
||||
@@ -187,13 +188,13 @@ async def upload_document(
|
||||
)
|
||||
|
||||
doc_id = str(uuid.uuid4())
|
||||
dest = await save_upload(file_data, user_id, doc_id)
|
||||
storage_key = await save_upload(file_data, user_id, doc_id)
|
||||
|
||||
doc = Document(
|
||||
id=doc_id,
|
||||
user_id=user_id,
|
||||
filename=file.filename or "upload.pdf",
|
||||
file_path=str(dest),
|
||||
storage_key=storage_key,
|
||||
file_size=len(file_data),
|
||||
status="pending",
|
||||
)
|
||||
@@ -578,7 +579,7 @@ async def delete_document(
|
||||
if not can_delete_via_share and not can_delete_as_group_admin:
|
||||
raise HTTPException(status_code=403, detail="Not allowed to delete this document")
|
||||
|
||||
delete_file(doc.file_path)
|
||||
await delete_file(doc.storage_key)
|
||||
await db.delete(doc)
|
||||
await db.commit()
|
||||
|
||||
@@ -609,13 +610,13 @@ async def download_file(
|
||||
if doc is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
async def file_generator():
|
||||
async with aiofiles.open(doc.file_path, "rb") as f:
|
||||
while chunk := await f.read(64 * 1024):
|
||||
yield chunk
|
||||
try:
|
||||
pdf_bytes = await download_file(doc.storage_key)
|
||||
except FileNotFoundError:
|
||||
raise HTTPException(status_code=404, detail="File not found in storage")
|
||||
|
||||
return StreamingResponse(
|
||||
file_generator(),
|
||||
iter([pdf_bytes]),
|
||||
media_type="application/pdf",
|
||||
headers={"Content-Disposition": f'inline; filename="{doc.filename}"'},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user