diff --git a/backend/migrations/versions/0004_phase4_pdf_open_mode_tsvector.py b/backend/migrations/versions/0004_phase4_pdf_open_mode_tsvector.py
new file mode 100644
index 0000000..f1aab00
--- /dev/null
+++ b/backend/migrations/versions/0004_phase4_pdf_open_mode_tsvector.py
@@ -0,0 +1,96 @@
+"""Phase 4 schema additions: pdf_open_mode column, GIN FTS index, audit-logs bucket.
+
+Revision ID: 0004
+Revises: 0003
+Create Date: 2026-05-25
+
+Changes (in order — see upgrade() for numbered sections):
+  1. Add users.pdf_open_mode column (server_default='in_app')
+  2. Create GIN expression index ix_documents_fts on documents.extracted_text
+     (PostgreSQL only — skipped on other dialects such as SQLite)
+  3. Create audit-logs MinIO bucket (gated on MINIO_ENDPOINT env var)
+
+Note on GIN index (T-04-02-01 mitigated):
+  Index is created via raw SQL op.execute() rather than Alembic Index() to prevent
+  Alembic's autogenerate from attempting to recreate the expression index on every
+  `alembic revision --autogenerate` run (Alembic issue #1390). A comment marks it
+  as manually managed. GIN and to_tsvector() are PostgreSQL features, so the
+  statement is only executed when the active dialect is postgresql.
+
+Note on MinIO bucket creation (T-04-02-02 mitigated):
+  Bucket creation is gated on MINIO_ENDPOINT env var so SQLite test runs are
+  unaffected. Credentials come exclusively from env vars — none are hardcoded.
+  MinIO default bucket policy is private; no public-access policy is set.
+
+Note on downgrade():
+  The GIN index and pdf_open_mode column are reversed. The audit-logs MinIO bucket
+  is NOT deleted — it may contain audit data.
+"""
+from __future__ import annotations
+
+import os
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "0004"
+down_revision = "0003"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add users.pdf_open_mode, the documents FTS index, and the audit-logs bucket."""
+    # ── 1. Add users.pdf_open_mode column ────────────────────────────────────
+    # batch_alter_table is required for SQLite ALTER TABLE support (transparent
+    # pass-through on PostgreSQL — no behavioral difference on production DB).
+    with op.batch_alter_table("users") as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "pdf_open_mode",
+                sa.String(),
+                nullable=False,
+                server_default="in_app",
+            )
+        )
+
+    # ── 2. Create GIN expression index on documents.extracted_text ───────────
+    # managed manually — do not autogenerate (Alembic issue #1390)
+    # USING GIN / to_tsvector() is PostgreSQL-only syntax; on SQLite it would abort
+    # the migration, so the statement is gated on the active dialect. The dialect
+    # is read from the migration context, which is also available in --sql mode.
+    if op.get_context().dialect.name == "postgresql":
+        op.execute(
+            "CREATE INDEX ix_documents_fts ON documents "
+            "USING GIN (to_tsvector('english', coalesce(extracted_text, '')))"
+        )
+
+    # ── 3. Create audit-logs MinIO bucket ────────────────────────────────────
+    # Skipped if MINIO_ENDPOINT is not set (SQLite test compatibility — T-04-02-02).
+    endpoint = os.environ.get("MINIO_ENDPOINT")
+    if endpoint:
+        from minio import Minio  # deferred import — only needed when MinIO is configured
+
+        client = Minio(
+            endpoint,
+            access_key=os.environ.get("MINIO_ACCESS_KEY", ""),
+            secret_key=os.environ.get("MINIO_SECRET_KEY", ""),
+            # NOTE(review): plain HTTP is hard-coded — confirm no deployment needs TLS here.
+            secure=False,
+        )
+        if not client.bucket_exists("audit-logs"):
+            client.make_bucket("audit-logs")
+
+
+def downgrade() -> None:
+    """Drop the FTS index and the users.pdf_open_mode column; keep the bucket."""
+    # ── 1. Drop GIN expression index ─────────────────────────────────────────
+    # IF EXISTS keeps this a no-op where upgrade() skipped the index (non-PostgreSQL).
+    op.execute("DROP INDEX IF EXISTS ix_documents_fts")
+
+    # ── 2. Drop users.pdf_open_mode column ───────────────────────────────────
+    with op.batch_alter_table("users") as batch_op:
+        batch_op.drop_column("pdf_open_mode")
+
+    # MinIO bucket NOT reversed — bucket may contain audit data
diff --git a/backend/storage/minio_backend.py b/backend/storage/minio_backend.py
index 516d060..cc8ba9f 100644
--- a/backend/storage/minio_backend.py
+++ b/backend/storage/minio_backend.py
@@ -85,6 +85,37 @@ class MinIOBackend(StorageBackend):
         )
         return object_key
 
+    async def put_object_raw(
+        self,
+        bucket: str,
+        key: str,
+        data: io.BytesIO,
+        length: int,
+        content_type: str,
+    ) -> None:
+        """Upload bytes to an arbitrary bucket+key (used for audit-logs CSV export).
+
+        Unlike put_object(), does NOT apply the document key schema — the caller
+        supplies the complete key. The main documents bucket is NOT used.
+
+        The blocking client call is run in a worker thread via asyncio.to_thread.
+
+        Args:
+            bucket: Name of the destination bucket.
+            key: Complete object key, passed to the client unchanged.
+            data: In-memory stream holding the payload.
+            length: Payload size, forwarded as the client's ``length`` argument.
+            content_type: Content type forwarded to the client.
+        """
+        await asyncio.to_thread(
+            self._client.put_object,
+            bucket,
+            key,
+            data,
+            length=length,
+            content_type=content_type,
+        )
+
     async def get_object(self, object_key: str) -> bytes:
         """Fetch object bytes from MinIO by key."""