feat: migrate doc-service to use storage-service for file I/O (Phase 2)

- storage.py: replace aiofiles filesystem ops with httpx calls to
  storage-service PUT/GET/DELETE /objects/documents/{key}
- Document model: rename file_path → storage_key (plain object key, no path prefix)
- Migration 0008: ALTER COLUMN + data migration strips /data/documents/ prefix
- documents.py: update upload, delete, download endpoints; _extract_pdf_text
  now takes bytes (pdfplumber.open(BytesIO)) instead of a filesystem path
- file_watcher.py: store storage_key instead of file_path on ingestion
- doc-service config: add STORAGE_SERVICE_URL env var

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-20 15:57:29 +02:00
parent 5349f21752
commit 2f3efb9bf9
6 changed files with 128 additions and 36 deletions
@@ -0,0 +1,56 @@
"""rename file_path to storage_key and strip filesystem prefix from existing rows
Revision ID: 0008
Revises: 0007
Create Date: 2026-04-20
Renames the documents.file_path column to storage_key.
Existing rows have paths like '/data/documents/{user_id}/{doc_id}.pdf' or
'/data/documents/watch/{doc_id}.pdf'. The migration strips the leading
'/data/documents/' prefix so the value becomes a plain storage key
(e.g. '{user_id}/{doc_id}.pdf') that the storage-service uses as the object key.
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Revision identifiers, matching the "Revision ID" / "Revises" lines in the
# module docstring: this migration is 0008 and applies on top of 0007.
revision: str = "0008"
down_revision: Union[str, None] = "0007"
# No branch labels or cross-revision dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
# Data-migration statement run by upgrade() after the column rename.
# SUBSTR drops exactly one leading '/data/documents/' prefix. REPLACE is
# deliberately avoided: it removes *every* occurrence of the substring and
# would mangle a key that contains '/data/documents/' again further along.
STRIP_PREFIX_SQL = (
    "UPDATE documents"
    " SET storage_key = SUBSTR(storage_key, LENGTH('/data/documents/') + 1)"
    " WHERE storage_key LIKE '/data/documents/%'"
)


def upgrade() -> None:
    """Rename documents.file_path to storage_key and strip the legacy prefix.

    Two steps, in this order:

    1. Rename the column inside a batch operation.
    2. Rewrite rows whose value starts with '/data/documents/' so that only
       the part after that prefix remains (e.g. '{user_id}/{doc_id}.pdf').

    Rows that do not start with the prefix are left untouched.
    """
    with op.batch_alter_table("documents") as batch_op:
        batch_op.alter_column(
            "file_path",
            new_column_name="storage_key",
            existing_type=sa.String(),
            existing_nullable=False,
        )

    # Runs after the batch block has exited: the UPDATE refers to the column
    # by its new name, so the rename has to be in place first.
    op.execute(sa.text(STRIP_PREFIX_SQL))
def downgrade() -> None:
    """Reverse the upgrade: re-add the path prefix, then rename the column back.

    The UPDATE prepends '/data/documents/' to every storage_key that does not
    already start with it; the intent is that older code can locate the files
    by filesystem path again. Afterwards storage_key is renamed to file_path.
    """
    # Issued before the rename, while the column is still called storage_key.
    restore_prefix = sa.text(
        "UPDATE documents SET storage_key = '/data/documents/' || storage_key"
        " WHERE storage_key NOT LIKE '/data/documents/%'"
    )
    op.execute(restore_prefix)

    with op.batch_alter_table("documents") as batch:
        batch.alter_column(
            "storage_key",
            existing_nullable=False,
            existing_type=sa.String(),
            new_column_name="file_path",
        )