From 6fed5ba531ba48143c581aa90cd1a6a7e81dba34 Mon Sep 17 00:00:00 2001 From: curo1305 Date: Fri, 22 May 2026 08:49:36 +0200 Subject: [PATCH] =?UTF-8?q?docs(01):=20create=20phase=201=20plan=20?= =?UTF-8?q?=E2=80=94=205=20plans=20in=204=20waves?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Research, pattern mapping, and verification complete. Walking Skeleton mode active (MVP Phase 1). Co-Authored-By: Claude Sonnet 4.6 --- .planning/ROADMAP.md | 7 +- .planning/STATE.md | 8 +- .../01-01-PLAN.md | 261 ++++ .../01-02-PLAN.md | 292 +++++ .../01-03-PLAN.md | 357 ++++++ .../01-04-PLAN.md | 360 ++++++ .../01-05-PLAN.md | 604 +++++++++ .../01-PATTERNS.md | 1082 ++++++++++++++++ .../01-RESEARCH.md | 1104 +++++++++++++++++ .../01-VALIDATION.md | 82 ++ .../01-infrastructure-foundation/SKELETON.md | 55 + 11 files changed, 4207 insertions(+), 5 deletions(-) create mode 100644 .planning/phases/01-infrastructure-foundation/01-01-PLAN.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-02-PLAN.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-03-PLAN.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-04-PLAN.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-05-PLAN.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-PATTERNS.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-RESEARCH.md create mode 100644 .planning/phases/01-infrastructure-foundation/01-VALIDATION.md create mode 100644 .planning/phases/01-infrastructure-foundation/SKELETON.md diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 2f0cc33..ee0e823 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -26,7 +26,12 @@ _Last updated: 2026-05-21_ 3. The full existing document upload, text extraction, and AI classification workflow completes successfully — no regression in single-user behavior 4. MinIO object key schema `{user_id}/{document_id}/{uuid4()}{ext}` is enforced in the model layer; human-readable filenames are stored in the DB column, not in the MinIO key -**Plans**: TBD +**Plans**: 5 plans +- [ ] 01-01-PLAN.md — Docker Compose service topology + Postgres init + Pydantic Settings + requirements +- [ ] 01-02-PLAN.md — Wave 0 test scaffolds (xfail/skip stubs) + async pytest fixtures +- [ ] 01-03-PLAN.md — SQLAlchemy ORM models + async engine + Alembic async migration (incl. alembic upgrade head) +- [ ] 01-04-PLAN.md — StorageBackend ABC + MinIO backend + rewritten async services/storage.py +- [ ] 01-05-PLAN.md — Lifespan + /health + API cutover + Celery worker + walking-skeleton e2e verify --- diff --git a/.planning/STATE.md b/.planning/STATE.md index 9d5645a..90aef73 100644 --- a/.planning/STATE.md +++ b/.planning/STATE.md @@ -3,12 +3,12 @@ gsd_state_version: 1.0 milestone: v1.0 milestone_name: milestone current_phase: — -status: planning -last_updated: "2026-05-21T19:31:20.618Z" +status: executing +last_updated: "2026-05-22T06:49:02.336Z" progress: total_phases: 5 completed_phases: 0 - total_plans: 0 + total_plans: 5 completed_plans: 0 percent: 0 --- @@ -16,7 +16,7 @@ progress: # Project State **Project:** DocuVault -**Status:** Planning +**Status:** Ready to execute **Current Phase:** — **Last Updated:** 2026-05-21 diff --git a/.planning/phases/01-infrastructure-foundation/01-01-PLAN.md b/.planning/phases/01-infrastructure-foundation/01-01-PLAN.md new file mode 100644 index 0000000..dd5dbd1 --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-01-PLAN.md @@ -0,0 +1,261 @@ +--- +phase: 01-infrastructure-foundation +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - docker-compose.yml + - docker/postgres/initdb.d/01-init-users.sql + - .env.example + - .gitignore + - backend/requirements.txt + - backend/config.py +autonomous: true +requirements: + - STORE-01 + - STORE-07 +user_setup: [] +tags: + - infrastructure + - docker-compose + - postgresql + - minio + - redis + - celery + - pydantic-settings + +must_haves: + truths: + - "Running `docker compose config` against the new compose file succeeds with no validation errors" + - "`.env.example` lists every variable referenced by `docker-compose.yml` via `${VAR}` interpolation" + - "`backend/config.py` exposes a Pydantic `Settings` instance reading `database_url`, `database_migrate_url`, `minio_endpoint`, `minio_access_key`, `minio_secret_key`, `minio_bucket`, `redis_url`, `secret_key` from environment" + - "`backend/requirements.txt` declares SQLAlchemy async, psycopg v3, Alembic, MinIO SDK, Celery+Redis, redis, aiosqlite for tests; `filelock` is removed" + - "`docker/postgres/initdb.d/01-init-users.sql` creates both `docuvault_app` and `docuvault_migrate` users with correct CONNECT grant" + artifacts: + - path: "docker-compose.yml" + provides: "postgres, minio, redis, celery-worker services with healthchecks and depends_on conditions" + contains: "postgres:17-alpine" + - path: "docker/postgres/initdb.d/01-init-users.sql" + provides: "docuvault_app and docuvault_migrate role provisioning" + contains: "CREATE USER docuvault_migrate" + - path: ".env.example" + provides: "Documented placeholders for DATABASE_URL, DATABASE_MIGRATE_URL, MINIO_*, REDIS_URL, SECRET_KEY" + contains: "DATABASE_MIGRATE_URL=postgresql+psycopg" + - path: "backend/config.py" + provides: "Pydantic Settings instance with all Phase 1 env vars" + contains: "class Settings(BaseSettings)" + - path: "backend/requirements.txt" + provides: "Updated dependency manifest" + contains: "sqlalchemy[asyncio]>=2.0" + key_links: + - from: "docker-compose.yml" + to: ".env.example variables" + via: "${VAR_NAME} interpolation" + pattern: "\\$\\{DATABASE_URL\\}|\\$\\{REDIS_URL\\}|\\$\\{MINIO_ENDPOINT\\}" + - from: "backend/config.py" + to: ".env.example variables" + via: "Pydantic Settings env_file" + pattern: "database_url|minio_endpoint|redis_url" + - from: "docker-compose.yml postgres service" + to: "docker/postgres/initdb.d/01-init-users.sql" + via: "bind-mount into /docker-entrypoint-initdb.d" + pattern: "initdb.d:/docker-entrypoint-initdb.d" +--- + + +Wire the Phase 1 service topology (PostgreSQL + MinIO + Redis + celery-worker) into Docker Compose, introduce the two-user PostgreSQL init script, replace the legacy `config.py` constants with a Pydantic `Settings` class that reads every Phase 1 env var, extend `.env.example` with all new variables grouped by service, and update `backend/requirements.txt` to swap `filelock` for the SQLAlchemy / Alembic / MinIO / Celery stack. + +Purpose: This is the foundation layer of the walking skeleton. Until `docker compose up` boots all five services cleanly and `Settings()` can read every Phase 1 variable, no subsequent plan can run. + +Output: A new `docker-compose.yml`, a new `docker/postgres/initdb.d/01-init-users.sql`, an extended `.env.example`, an updated `backend/requirements.txt`, and a rewritten `backend/config.py`. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-infrastructure-foundation/01-CONTEXT.md +@.planning/phases/01-infrastructure-foundation/01-RESEARCH.md +@.planning/phases/01-infrastructure-foundation/01-PATTERNS.md +@.planning/phases/01-infrastructure-foundation/SKELETON.md + + +Key existing structures the executor must preserve: + +From the current `docker-compose.yml` (only `backend` and `frontend` services exist today; backend volumes include `./backend/data:/app/data` which MUST be removed per D-04). The Compose file uses no top-level `volumes:` block yet. + +From the current `backend/config.py`: module-level constants `DATA_DIR`, `UPLOADS_DIR`, `METADATA_DIR`, `TOPICS_FILE`, `SETTINGS_FILE`, `DEFAULT_SYSTEM_PROMPT`, `DEFAULT_SETTINGS`, and a function `ensure_data_dirs()`. `DEFAULT_SYSTEM_PROMPT` and `DEFAULT_SETTINGS` must be preserved verbatim because `services/storage.py`, `services/classifier.py`, and `api/settings.py` still consume them; `DATA_DIR`/`UPLOADS_DIR`/`METADATA_DIR`/`TOPICS_FILE`/`SETTINGS_FILE`/`ensure_data_dirs` will be removed by Plan 05 once `services/storage.py` is rewritten — leave them in place for now to keep the rest of the app booting between waves. + +From `backend/requirements.txt`: `pydantic-settings>=2.2` is already declared (line 4) — no new install needed for that package. + +Env var canon (sourced from RESEARCH.md Code Examples lines 914-937 and PATTERNS.md `.env.example` section): +- `DATABASE_URL` — `postgresql+psycopg://docuvault_app:@postgres:5432/docuvault` +- `DATABASE_MIGRATE_URL` — `postgresql+psycopg://docuvault_migrate:@postgres:5432/docuvault` +- `POSTGRES_PASSWORD` — superuser password for the init container (not used by app) +- `MINIO_ROOT_USER` / `MINIO_ROOT_PASSWORD` — MinIO root, init-time only +- `MINIO_ENDPOINT` — `minio:9000` +- `MINIO_ACCESS_KEY` / `MINIO_SECRET_KEY` — app-level access key pair +- `MINIO_BUCKET` — value `docuvault` +- `REDIS_PASSWORD` — used by Redis `--requirepass` and inside `REDIS_URL` +- `REDIS_URL` — `redis://:@redis:6379/0` +- `SECRET_KEY` — Phase 2 JWT/HKDF placeholder; documented now, not read by Phase 1 code paths + + + + + + + Task 1: Replace docker-compose.yml with five-service stack and add PostgreSQL init script + docker-compose.yml, docker/postgres/initdb.d/01-init-users.sql, .gitignore + + - docker-compose.yml (current 2-service file; remove `./backend/data:/app/data` volume per D-04) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pattern 6: Docker Compose Health Checks; Pitfall 5: Redis healthcheck auth; Pitfall 6: MinIO mc; Pattern 7: PostgreSQL Two-User Init Script) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (docker-compose.yml section — full service block source-of-truth; initdb.d section) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-06 bucket name `docuvault`; D-09 Redis in Phase 1; D-10 celery-worker service; D-14 init script provisions both users) + + + Rewrite `docker-compose.yml` to declare exactly five services — `postgres`, `minio`, `redis`, `backend`, `celery-worker` — plus the existing `frontend` service. Use image tags `postgres:17-alpine`, `minio/minio:latest`, `redis:7-alpine`. Each infrastructure service has a `healthcheck` block: postgres uses `pg_isready -U postgres -d docuvault` (interval 10s, timeout 5s, retries 5, start_period 10s); minio uses `["CMD", "mc", "ready", "local"]` (interval 10s, timeout 5s, retries 5, start_period 15s) per D-07 (NOT `curl` — removed from image); redis uses `["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"]` (interval 10s, timeout 3s, retries 5) per Pitfall 5 — bare `redis-cli ping` returns NOAUTH. The `backend` service `depends_on` block uses `condition: service_healthy` for all three of postgres, minio, redis. The `celery-worker` service uses the same `build: ./backend`, has identical `depends_on`, and its `command:` is `celery -A celery_app worker --loglevel=info -Q documents` per D-10. Wire all environment variables on `backend` and `celery-worker` via `${VAR}` interpolation: `DATABASE_URL`, `DATABASE_MIGRATE_URL` (backend only — workers do not need DDL access), `MINIO_ENDPOINT`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, `MINIO_BUCKET`, `REDIS_URL`, plus `PYTHONDONTWRITEBYTECODE=1`. The postgres service has `POSTGRES_DB: docuvault`, `POSTGRES_USER: postgres`, `POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}`. The minio service exposes ports `9000:9000` and `9001:9001` and runs `command: server /data --console-address ":9001"`. Add a top-level `volumes:` block declaring `postgres_data:` and `minio_data:`. Mount `./docker/postgres/initdb.d:/docker-entrypoint-initdb.d:ro` on the postgres service. **Delete** the `./backend/data:/app/data` volume from the `backend` service (D-04). Keep the existing `./backend:/app` bind mount for hot reload. Keep the existing `frontend` service unchanged. Then create the directory `docker/postgres/initdb.d/` and the file `docker/postgres/initdb.d/01-init-users.sql` per Pattern 7: it must `CREATE USER docuvault_migrate WITH PASSWORD 'changeme_migrate'`, `GRANT ALL PRIVILEGES ON DATABASE docuvault TO docuvault_migrate`, `CREATE USER docuvault_app WITH PASSWORD 'changeme_app'`, and `GRANT CONNECT ON DATABASE docuvault TO docuvault_app`. Do NOT include schema-level GRANTs or `ALTER DEFAULT PRIVILEGES` here — Plan 03 handles those inside the Alembic initial migration (Pitfall 4). Add a comment to the SQL file noting that table-level grants are issued by the Alembic migration. Finally, ensure `.env` is gitignored: read the existing `.gitignore` (create one if missing), and append a `.env` line if not present. + + + cd /Users/nik/Documents/Progamming/document_scanner && docker compose --env-file .env.example config -q 2>&1 | grep -v "warning" | (! grep -q "error\|invalid"); echo "compose-config-exit=$?" + + + - `docker-compose.yml` contains exactly these top-level service keys: `postgres`, `minio`, `redis`, `backend`, `celery-worker`, `frontend` (verifiable via `grep -E "^ (postgres|minio|redis|backend|celery-worker|frontend):" docker-compose.yml | wc -l` equals 6) + - `docker-compose.yml` contains the string `postgres:17-alpine` + - `docker-compose.yml` contains the string `minio/minio:latest` + - `docker-compose.yml` contains the string `redis:7-alpine` + - `docker-compose.yml` contains the substring `mc", "ready", "local"` in the minio healthcheck (NOT `curl`) + - `docker-compose.yml` contains the substring `redis-cli", "-a"` in the redis healthcheck + - `docker-compose.yml` contains `pg_isready -U postgres -d docuvault` in the postgres healthcheck + - `docker-compose.yml` contains the substring `celery -A celery_app worker` in the celery-worker service command + - `docker-compose.yml` contains `condition: service_healthy` at least three times (one per infrastructure dependency on backend) + - `docker-compose.yml` contains a top-level `volumes:` block with both `postgres_data:` and `minio_data:` + - `docker-compose.yml` does NOT contain `./backend/data:/app/data` (D-04 removal verifiable via `grep -v "^#" docker-compose.yml | grep -c "backend/data" | grep -q "^0$"`) + - `docker-compose.yml` references `${DATABASE_URL}`, `${DATABASE_MIGRATE_URL}`, `${MINIO_ENDPOINT}`, `${MINIO_ACCESS_KEY}`, `${MINIO_SECRET_KEY}`, `${MINIO_BUCKET}`, `${REDIS_URL}`, `${REDIS_PASSWORD}`, `${POSTGRES_PASSWORD}`, `${MINIO_ROOT_USER}`, `${MINIO_ROOT_PASSWORD}` (each verifiable via `grep -c "\${VAR}" docker-compose.yml >= 1`) + - `docker compose --env-file .env.example config -q` exits 0 (no parsing errors) + - File `docker/postgres/initdb.d/01-init-users.sql` exists + - The SQL script contains `CREATE USER docuvault_migrate` and `CREATE USER docuvault_app` + - The SQL script contains `GRANT ALL PRIVILEGES ON DATABASE docuvault TO docuvault_migrate` + - The SQL script contains `GRANT CONNECT ON DATABASE docuvault TO docuvault_app` + - The SQL script does NOT contain `GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES` (those go in the Alembic migration per Pitfall 4) + - `.gitignore` contains a `.env` line (`grep -Fx ".env" .gitignore` exits 0) + + The Compose file declares the full five-service Phase 1 stack with all health checks and dependency conditions; the PostgreSQL init script provisions both DB users on first container start; `.env` is gitignored. + + + + Task 2: Extend .env.example with all Phase 1 variables (D-11, D-13, D-15, D-16) + .env.example + + - .env.example (current 6-line file with only ANTHROPIC_API_KEY and OPENAI_API_KEY) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (`.env.example` section — full block source-of-truth) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-11 .env.example committed; D-13 two DSNs; D-15 MinIO vars including separate access key; D-16 REDIS_URL + SECRET_KEY documented now) + - docker/postgres/initdb.d/01-init-users.sql (Task 1 output — verify the placeholder passwords in this file match the DATABASE_URL / DATABASE_MIGRATE_URL passwords) + + + Replace `.env.example` keeping the existing `ANTHROPIC_API_KEY=` and `OPENAI_API_KEY=` lines at the top, then append four new sections grouped by `# ── ───` comment headers: + + 1. **PostgreSQL section**: `DATABASE_URL=postgresql+psycopg://docuvault_app:changeme_app@postgres:5432/docuvault` (with explanatory comment "App user — SELECT/INSERT/UPDATE/DELETE only, used by FastAPI + Celery"), `DATABASE_MIGRATE_URL=postgresql+psycopg://docuvault_migrate:changeme_migrate@postgres:5432/docuvault` (with comment "Migration user — DDL privileges, used ONLY by Alembic, never by the app at runtime"), `POSTGRES_PASSWORD=changeme_super` (with comment "Superuser password for the postgres init container — used only by initdb.d scripts"). + + 2. **MinIO section**: `MINIO_ROOT_USER=minioadmin`, `MINIO_ROOT_PASSWORD=changeme_minio_root`, `MINIO_ENDPOINT=minio:9000`, `MINIO_ACCESS_KEY=docuvault_app` (comment: "App-level access key — minimal permissions on docuvault bucket only"), `MINIO_SECRET_KEY=changeme_minio_app`, `MINIO_BUCKET=docuvault`. + + 3. **Redis section**: `REDIS_PASSWORD=changeme_redis`, `REDIS_URL=redis://:changeme_redis@redis:6379/0` (comment noting it must match `REDIS_PASSWORD` and that the leading `:` is the no-username form for `requirepass`). + + 4. **Security (Phase 2) section**: `SECRET_KEY=CHANGEME-replace-with-64-char-random-hex` (comment: "Not read by the app in Phase 1 — documented here for Phase 2 JWT + HKDF use"). Each value uses `changeme_*` style placeholders to make obvious which fields require replacement. The DATABASE_URL password (`changeme_app`) and DATABASE_MIGRATE_URL password (`changeme_migrate`) MUST match the hardcoded passwords in `docker/postgres/initdb.d/01-init-users.sql` from Task 1 — re-read that file to confirm. End the file with a final newline. + + + grep -E '^(DATABASE_URL|DATABASE_MIGRATE_URL|POSTGRES_PASSWORD|MINIO_ROOT_USER|MINIO_ROOT_PASSWORD|MINIO_ENDPOINT|MINIO_ACCESS_KEY|MINIO_SECRET_KEY|MINIO_BUCKET|REDIS_PASSWORD|REDIS_URL|SECRET_KEY|ANTHROPIC_API_KEY|OPENAI_API_KEY)=' /Users/nik/Documents/Progamming/document_scanner/.env.example | sort -u | wc -l | awk '{exit ($1 == 14) ? 0 : 1}' + + + - `.env.example` defines exactly 14 named variables: `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `DATABASE_URL`, `DATABASE_MIGRATE_URL`, `POSTGRES_PASSWORD`, `MINIO_ROOT_USER`, `MINIO_ROOT_PASSWORD`, `MINIO_ENDPOINT`, `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, `MINIO_BUCKET`, `REDIS_PASSWORD`, `REDIS_URL`, `SECRET_KEY` (verifiable via the Verify command) + - `DATABASE_URL` value starts with `postgresql+psycopg://docuvault_app:` (verifiable via `grep -c "^DATABASE_URL=postgresql+psycopg://docuvault_app:" .env.example` >= 1) + - `DATABASE_MIGRATE_URL` value starts with `postgresql+psycopg://docuvault_migrate:` + - `MINIO_BUCKET=docuvault` exactly (D-06) + - `REDIS_URL` value matches the form `redis://:@redis:6379/0` (verifiable via `grep -E "^REDIS_URL=redis://:[^@]+@redis:6379/0$" .env.example` exits 0) + - `MINIO_ENDPOINT=minio:9000` exactly + - Section headers are present: `grep -c "── PostgreSQL ──" .env.example` >= 1, similarly for MinIO, Redis, Security + - The password embedded in `DATABASE_URL` matches the password literal used in `docker/postgres/initdb.d/01-init-users.sql` for the `docuvault_app` user (re-read both files and confirm string equality of the password substring) + - The password embedded in `DATABASE_MIGRATE_URL` matches the password literal used in `docker/postgres/initdb.d/01-init-users.sql` for the `docuvault_migrate` user + - The password embedded in `REDIS_URL` matches `REDIS_PASSWORD` (verifiable: extract value of REDIS_PASSWORD and grep `redis://:${value}@` in REDIS_URL line) + + `.env.example` documents every Phase 1 environment variable with safe placeholder values, grouped and commented by service; the DB and Redis password placeholders are consistent between `.env.example` and the Postgres init script. + + + + Task 3: Replace backend/config.py with Pydantic Settings + extend backend/requirements.txt + backend/config.py, backend/requirements.txt + + - backend/config.py (current — preserve `DEFAULT_SYSTEM_PROMPT`, `DEFAULT_SETTINGS`, and `ensure_data_dirs()` verbatim because they are still consumed by `services/storage.py`, `services/classifier.py`, `api/settings.py` until Plan 05; rewrite the rest as a Pydantic `Settings` class) + - backend/requirements.txt (current — `pydantic-settings>=2.2` and `httpx>=0.27` and `pytest-asyncio>=0.23` already present; add new deps, remove `filelock`) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Standard Stack section: exact pinned versions; Config Extension code example lines 914-937) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/config.py section — module-level Settings instance is the established consumer interface) + - .env.example (Task 2 output — variable names must match the Settings field names lower-cased) + + + Rewrite `backend/config.py` so it imports `from pydantic_settings import BaseSettings`, defines a `Settings(BaseSettings)` class with these fields (with the listed defaults — defaults are used only if env vars are unset, which keeps tests workable): `data_dir: str = "/app/data"`, `database_url: str = "postgresql+psycopg://docuvault_app:changeme_app@postgres:5432/docuvault"`, `database_migrate_url: str = "postgresql+psycopg://docuvault_migrate:changeme_migrate@postgres:5432/docuvault"`, `minio_endpoint: str = "minio:9000"`, `minio_access_key: str = "docuvault_app"`, `minio_secret_key: str = "changeme_minio_app"`, `minio_bucket: str = "docuvault"`, `redis_url: str = "redis://:changeme_redis@redis:6379/0"`, `secret_key: str = "CHANGEME"`. Inside the class, add `model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")` (Pydantic Settings v2 API — `class Config:` is deprecated). After the class, instantiate `settings = Settings()` at module level so all callers `from config import settings`. **Preserve and keep at module level for backward compatibility through Wave 4:** the existing `DATA_DIR = Path(...)`, `UPLOADS_DIR`, `METADATA_DIR`, `TOPICS_FILE`, `SETTINGS_FILE` constants; the `DEFAULT_SYSTEM_PROMPT` string; the `DEFAULT_SETTINGS` dict; and the `ensure_data_dirs()` function. Plan 05 deletes these once `services/storage.py` is rewritten. The rewrite is additive in this plan: the existing app must still boot after this change. Then update `backend/requirements.txt`: REMOVE the line `filelock>=3.14` (RESEARCH.md "State of the Art" — replaced by PostgreSQL transactions). APPEND these new lines: `sqlalchemy[asyncio]>=2.0.49`, `psycopg[binary]>=3.3.4`, `alembic>=1.18.4`, `minio>=7.2.20`, `celery[redis]>=5.6.3`, `redis>=7.4.0`, `aiosqlite>=0.20.0` (needed by Plan 02's in-memory test engine). Bump `pytest-asyncio>=1.3.0` (existing `>=0.23` no longer supports `asyncio_mode = auto` reliably with current pytest). Keep all other existing lines. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -c "import ast; tree = ast.parse(open('config.py').read()); names = {n.name for node in ast.walk(tree) for n in ([node] if isinstance(node, ast.ClassDef) else [])}; assert 'Settings' in names, 'Settings class missing'; print('settings-class-ok')" + + + - `backend/config.py` contains `class Settings(BaseSettings):` (verifiable via `grep -c "class Settings(BaseSettings)" backend/config.py` >= 1) + - `backend/config.py` contains the literal `settings = Settings()` at module level + - `backend/config.py` contains every field name `database_url`, `database_migrate_url`, `minio_endpoint`, `minio_access_key`, `minio_secret_key`, `minio_bucket`, `redis_url`, `secret_key` (each verifiable via grep) + - `backend/config.py` uses `SettingsConfigDict(env_file=".env"` (not the deprecated `class Config:` form) — verifiable via `grep -c "SettingsConfigDict" backend/config.py` >= 1 + - `backend/config.py` still exports the `DEFAULT_SYSTEM_PROMPT` and `DEFAULT_SETTINGS` constants and the `ensure_data_dirs` function (verifiable: `grep -c "DEFAULT_SYSTEM_PROMPT\|DEFAULT_SETTINGS\|def ensure_data_dirs" backend/config.py` >= 3) + - `python3 -c "import sys; sys.path.insert(0, 'backend'); from config import settings; assert settings.minio_bucket == 'docuvault'; assert settings.database_url.startswith('postgresql+psycopg://'); print('config-import-ok')"` exits 0 (executed from the project root) + - `backend/requirements.txt` no longer contains `filelock` (verifiable: `grep -c "^filelock" backend/requirements.txt | grep -q "^0$"`) + - `backend/requirements.txt` contains each of: `sqlalchemy[asyncio]>=2.0`, `psycopg[binary]>=3.3`, `alembic>=1.18`, `minio>=7.2`, `celery[redis]>=5.6`, `redis>=7.4`, `aiosqlite>=0.20`, `pytest-asyncio>=1.3` (each verifiable via `grep -F` on the line prefix) + - Existing lines preserved: `fastapi>=0.111`, `uvicorn[standard]>=0.29`, `python-multipart`, `pydantic-settings>=2.2`, `anthropic>=0.26`, `openai>=1.30`, `PyMuPDF>=1.24`, `python-docx>=1.1`, `pytesseract>=0.3`, `Pillow>=10.3`, `aiofiles>=23.2`, `httpx>=0.27`, `pytest>=8.2` (each verifiable via `grep -F`) + + `config.settings` is a Pydantic Settings instance reading every Phase 1 env var; the legacy data-dir constants remain available for the still-running flat-file code path; `requirements.txt` declares the full Phase 1 dependency set and removes `filelock`. + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| Docker host → containers | Compose orchestrates services; environment variables flow from `.env`/`/etc/docuvault/env` into containers | +| Backend container → PostgreSQL | App connects with restricted `docuvault_app` role; Alembic connects with privileged `docuvault_migrate` role | +| Backend container → MinIO | App connects with app-level access key (separate from MinIO root credentials) | +| Backend container → Redis | App + Celery worker connect with `requirepass`-protected URL | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-01-01-01 | Elevation of Privilege | PostgreSQL connection from app | mitigate | Two-DSN pattern (D-13): `DATABASE_URL` uses `docuvault_app` (DML only, no DDL); `DATABASE_MIGRATE_URL` uses `docuvault_migrate` (DDL only, used by Alembic only). Init script in Task 1 hard-codes both grants; Plan 03 issues `ALTER DEFAULT PRIVILEGES` inside the migration. | +| T-01-01-02 | Elevation of Privilege | MinIO root credentials | mitigate | `MINIO_ROOT_USER` / `MINIO_ROOT_PASSWORD` used only for MinIO container init; app uses separate `MINIO_ACCESS_KEY` / `MINIO_SECRET_KEY` (D-15); the app never connects with root credentials. | +| T-01-01-03 | Information Disclosure | Redis unauthenticated access on Docker network | mitigate | Redis runs with `--requirepass ${REDIS_PASSWORD}` (Pattern 6); both app and worker connect via `REDIS_URL` containing the password; healthcheck passes `-a $REDIS_PASSWORD` per Pitfall 5. | +| T-01-01-04 | Information Disclosure | Secret leakage via committed `.env` file | mitigate | `.env` added to `.gitignore` in Task 1; only `.env.example` with placeholder `changeme_*` values is committed (D-11); production secrets stored outside the project at `/etc/docuvault/env` `chmod 600` (D-12, documented in SKELETON.md). | +| T-01-01-05 | Tampering | Compose service starts before its dependencies are ready | mitigate | `depends_on: condition: service_healthy` on backend and celery-worker for postgres + minio + redis (Pattern 6); replaces race-prone `sleep` patterns. | +| T-01-01-SC | Tampering | npm/pip supply chain on dependency install | mitigate | RESEARCH.md Package Legitimacy Audit verified all 6 new packages on PyPI via `pip3 index versions` + slopcheck OK; no `[ASSUMED]` or `[SUS]` packages — no checkpoint required this plan. | + + + +- After all three tasks complete, run `cd /Users/nik/Documents/Progamming/document_scanner && docker compose --env-file .env.example config -q` — must exit 0. +- Confirm `cd backend && python3 -c "from config import settings; print(settings.minio_bucket)"` prints `docuvault`. +- `grep -c "filelock" backend/requirements.txt` must equal 0. + + + +- `docker-compose.yml`, `docker/postgres/initdb.d/01-init-users.sql`, `.env.example`, `backend/config.py`, and `backend/requirements.txt` are all updated according to the acceptance criteria above. +- `docker compose --env-file .env.example config -q` exits 0. +- `from config import settings` works in Python without raising; `settings.minio_bucket == "docuvault"`. +- `.env` is gitignored and `.env.example` documents every env var referenced by `docker-compose.yml`. +- No existing flat-file constants are deleted yet — the app must still boot after this plan in isolation (Plan 05 completes the cutover). + + + +Create `.planning/phases/01-infrastructure-foundation/01-01-SUMMARY.md` when done summarizing: services declared, env vars introduced, dependencies added/removed, and any deviations from the plan. + diff --git a/.planning/phases/01-infrastructure-foundation/01-02-PLAN.md b/.planning/phases/01-infrastructure-foundation/01-02-PLAN.md new file mode 100644 index 0000000..b8c5117 --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-02-PLAN.md @@ -0,0 +1,292 @@ +--- +phase: 01-infrastructure-foundation +plan: 02 +type: execute +wave: 1 +depends_on: [] +files_modified: + - backend/tests/conftest.py + - backend/tests/test_health.py + - backend/tests/test_storage.py + - backend/tests/test_documents.py + - backend/tests/test_alembic.py +autonomous: true +requirements: + - STORE-01 + - STORE-02 + - STORE-07 +user_setup: [] +tags: + - testing + - wave-0 + - pytest + - tdd + +must_haves: + truths: + - "Test scaffolds exist for every Wave 0 gap identified in VALIDATION.md before any implementation runs" + - "`tests/test_storage.py` exists with at least one passing unit test that asserts the MinIO object key format and at least one passing unit test that asserts the human filename is NOT in the object key" + - "`tests/test_health.py` declares `test_health_checks_postgres_and_minio` that fails with a clear xfail/skip marker today and is ready to pass once Plan 05 ships" + - "`tests/test_alembic.py` exists and contains at least one test that asserts the initial migration applies and creates the expected tables" + - "`tests/conftest.py` provides an async `db_session` fixture (in-memory SQLite via aiosqlite) and an async `client` fixture that overrides `get_db`" + - "The full pytest run completes — Wave 0 tests for not-yet-implemented behavior are marked `xfail(strict=False)` or `skip(reason=...)` so the suite stays green" + artifacts: + - path: "backend/tests/test_storage.py" + provides: "Wave-0 unit tests for STORE-02 (object key schema, filename isolation)" + min_lines: 40 + - path: "backend/tests/test_alembic.py" + provides: "Wave-0 integration test that runs `alembic upgrade head` against an in-memory engine and asserts table existence" + min_lines: 30 + - path: "backend/tests/test_health.py" + provides: "Extended health probe assertions for postgres+minio" + contains: "test_health_checks_postgres_and_minio" + - path: "backend/tests/conftest.py" + provides: "Async SQLAlchemy engine + AsyncClient fixtures replacing the flat-file isolation fixture" + contains: "AsyncClient" + key_links: + - from: "tests/conftest.py" + to: "db/models.Base.metadata" + via: "create_all on aiosqlite engine" + pattern: "Base\\.metadata\\.create_all" + - from: "tests/conftest.py" + to: "deps.db.get_db" + via: "app.dependency_overrides[get_db]" + pattern: "dependency_overrides\\[get_db\\]" + - from: "tests/test_alembic.py" + to: "alembic command runner" + via: "subprocess or alembic.command.upgrade" + pattern: "alembic\\.command\\.upgrade|subprocess.*alembic" +--- + + +Author the Wave 0 test scaffolds named in `01-VALIDATION.md` BEFORE any implementation lands. Each test states the expected behavior of Plan 03-05 deliverables; the tests for code that does not yet exist are marked `xfail(strict=False)` or `skip(reason="implemented in plan NN")` so the suite stays green between waves. This enforces TDD discipline: every executor in later plans must remove the xfail/skip marker once their code lands. + +Purpose: Wave 0 fills the validation gaps catalogued in `01-VALIDATION.md` Section "Wave 0 Gaps" so that every later task has a meaningful `` verify command. Without this plan, later tasks would have no automated test target and would silently regress to "smoke check by hand." + +Output: Five test files (one new, four updated) plus a refreshed `conftest.py` with async SQLAlchemy fixtures. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-infrastructure-foundation/01-CONTEXT.md +@.planning/phases/01-infrastructure-foundation/01-RESEARCH.md +@.planning/phases/01-infrastructure-foundation/01-PATTERNS.md +@.planning/phases/01-infrastructure-foundation/01-VALIDATION.md + + +The tests in this plan describe the interface that Plans 03, 04, and 05 must build to. They are intentionally written BEFORE the implementation. + +Interfaces under test (will exist after Plans 03-05): + +```python +# backend/db/models.py — created by Plan 03 +class Base(DeclarativeBase): ... +class User(Base): __tablename__ = "users"; id, handle, email, ... +class Document(Base): __tablename__ = "documents"; id, user_id (NULLABLE in Phase 1 per D-03), filename, object_key, ... +class Topic(Base): __tablename__ = "topics"; id, user_id, name, description, color +class CloudConnection(Base): __tablename__ = "cloud_connections"; ... +class Group(Base): __tablename__ = "groups" # D-02 stub +# Full table list: users, quotas, refresh_tokens, folders, documents, topics, document_topics, shares, audit_log, cloud_connections, groups + +# backend/deps/db.py — created by Plan 03 +async def get_db() -> AsyncGenerator[AsyncSession, None]: ... + +# backend/storage/base.py — created by Plan 04 +class StorageBackend(ABC): + async def put_object(user_id, document_id, file_bytes, extension, content_type) -> str # returns object_key + async def get_object(object_key) -> bytes + async def delete_object(object_key) -> None + async def presigned_get_url(object_key, expires_minutes=60) -> str + async def health_check() -> bool + +# backend/storage/minio_backend.py — created by Plan 04 +class MinIOBackend(StorageBackend): ... + +# backend/main.py — modified by Plan 05; /health response shape: +# {"status": "ok"|"degraded", "checks": {"postgres": "ok"|"error: ...", "minio": "ok"|"error: ..."}} +``` + +Existing files referenced by tests: +- `backend/main.py` exports `app: FastAPI` (current top-level binding at line 16) +- `backend/api/documents.py` exposes `POST /api/documents/upload`, `GET /api/documents`, `GET /api/documents/{doc_id}`, `DELETE /api/documents/{doc_id}` (existing route table) +- `backend/pytest.ini` already sets `asyncio_mode = auto` and `testpaths = tests` + + + + + + + Task 1: Rewrite tests/conftest.py with async SQLAlchemy + AsyncClient fixtures + backend/tests/conftest.py + + - The `db_session` fixture yields a working `AsyncSession` bound to an aiosqlite in-memory engine with `Base.metadata.create_all` applied; the session uses `expire_on_commit=False` + - The `client` fixture returns an `httpx.AsyncClient` with `ASGITransport(app)` and overrides `deps.db.get_db` to yield the test session + - Both fixtures are `pytest_asyncio.fixture` decorated and clean up engine/session/overrides on teardown + - Existing `sample_txt` and `sample_pdf` fixtures continue to work unchanged + - The autouse `isolated_data_dir` fixture is RETAINED for now (still monkeypatches `config.DATA_DIR` and friends) so the unchanged `services/storage.py` / `services/classifier.py` still run during the transition; Plan 05 removes it as part of the cutover + + + - backend/tests/conftest.py (current 71-line file — preserve `sample_txt`, `sample_pdf`, the `isolated_data_dir` autouse fixture verbatim; ADD new async fixtures alongside, do not remove the old ones in this plan) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (`backend/tests/conftest.py` section — full code example for async fixture) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pattern 1: SQLAlchemy async engine; `expire_on_commit=False` required) + - backend/pytest.ini (verify `asyncio_mode = auto` is set) + + + Edit `backend/tests/conftest.py` to ADD (do not remove existing fixtures) the following at the top of the file: imports for `pytest_asyncio`, `httpx.AsyncClient`, `httpx.ASGITransport`, `sqlalchemy.ext.asyncio.create_async_engine`, `sqlalchemy.ext.asyncio.async_sessionmaker`, `sqlalchemy.ext.asyncio.AsyncSession`, `sqlalchemy.pool.StaticPool`. Define an `@pytest_asyncio.fixture` named `db_session` that: (1) creates an async engine via `create_async_engine("sqlite+aiosqlite:///:memory:", connect_args={"check_same_thread": False}, poolclass=StaticPool)`; (2) inside `async with engine.begin() as conn:` block, tries `from db.models import Base; await conn.run_sync(Base.metadata.create_all)` — wrap this in a try/except `ImportError` that pytest.skip-s the test with reason `"db.models not yet implemented — plan 03"` so the fixture is safe during Wave 1; (3) builds `AsyncTestSession = async_sessionmaker(engine, expire_on_commit=False)`; (4) yields a session via `async with AsyncTestSession() as session: yield session`; (5) calls `await engine.dispose()` on teardown. Define an `@pytest_asyncio.fixture` named `async_client` (NEW NAME — keep existing sync `client` fixture intact for legacy tests) that: takes `db_session` as a dependency, attempts `from deps.db import get_db; from main import app; app.dependency_overrides[get_db] = lambda: db_session`, wraps in try/except `ImportError` that pytest.skip-s with reason `"deps.db.get_db not yet implemented — plan 03"`, yields `async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: yield c`, then `app.dependency_overrides.clear()` on teardown. Keep the existing autouse `isolated_data_dir`, `client` (sync TestClient), `sample_txt`, `sample_pdf` fixtures verbatim. Add an explanatory comment at the top of the file: "Async fixtures (db_session, async_client) are added for Phase 1 — sync fixtures remain until Plan 05 cuts over." + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -m pytest tests/ -v --collect-only 2>&1 | tail -20 ; cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -m pytest tests/conftest.py --co -q 2>&1 | head -10 + + + - `backend/tests/conftest.py` contains `@pytest_asyncio.fixture` at least twice (one each for `db_session` and `async_client`) — verifiable via `grep -c "@pytest_asyncio.fixture" backend/tests/conftest.py >= 2` + - File contains `from sqlalchemy.ext.asyncio import create_async_engine` (exact import) + - File contains the literal string `sqlite+aiosqlite:///:memory:` + - File contains `expire_on_commit=False` + - File contains `ASGITransport(app=app)` + - File contains `app.dependency_overrides[get_db]` (verifying the override pattern is wired) + - File still contains the existing `sample_txt` and `sample_pdf` and `isolated_data_dir` fixture definitions (each verifiable via `grep -c "def sample_txt\|def sample_pdf\|def isolated_data_dir" backend/tests/conftest.py >= 3`) + - `cd backend && python3 -m pytest tests/ --collect-only -q` exits 0 (collection succeeds even with new fixtures referencing not-yet-existing modules, because pytest.skip is used on ImportError) + - `cd backend && python3 -m pytest tests/test_health.py -v` exits 0 (the existing sync health test still passes) + + conftest.py provides both sync and async fixtures; existing tests continue to pass; new async fixtures are ready for Plan 03+ tests but degrade gracefully (pytest.skip) until db/models and deps/db.py exist. + + + + Task 2: Create tests/test_storage.py + tests/test_alembic.py (Wave 0 scaffolds) + backend/tests/test_storage.py, backend/tests/test_alembic.py + + - `test_object_key_schema`: object keys generated by `MinIOBackend.put_object` MUST match the regex `^[^/]+/[^/]+/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}(\.[a-zA-Z0-9]+)?$` (STORE-02) + - `test_filename_not_in_object_key`: passing `original_name="invoice_Q3_2025_secret.pdf"` to a fake/mocked MinIO put MUST produce a key whose middle segment (uuid4 part) does NOT contain `invoice` or `Q3` or `secret` + - `test_storage_backend_abc_methods`: instantiating a class that inherits `StorageBackend` without implementing all 5 abstract methods raises `TypeError` + - `test_get_storage_backend_returns_minio`: the factory `get_storage_backend()` returns an instance of `MinIOBackend` + - `test_migration_creates_all_tables`: after `alembic upgrade head`, the database has tables `users`, `quotas`, `refresh_tokens`, `folders`, `documents`, `topics`, `document_topics`, `shares`, `audit_log`, `cloud_connections`, `groups` (11 tables) + - `test_documents_user_id_nullable`: in the freshly migrated schema, the `documents.user_id` column has `is_nullable=YES` (D-03) + + + - .planning/phases/01-infrastructure-foundation/01-VALIDATION.md (Wave 0 Requirements: tests/test_storage.py and tests/test_alembic.py) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Code Examples: schema lines 769-908 for table names; Pattern 8 for ABC; Architecture Patterns lines 612-651 for factory) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/tests/test_storage.py section) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-01 full v1 schema; D-02 groups stub; D-03 documents.user_id nullable; D-06 object key schema) + + + Create `backend/tests/test_storage.py` containing six top-level async tests, each marked with `@pytest.mark.xfail(strict=False, reason="implemented in plan 04")` so the suite stays green until Plan 04 ships: + + 1. `test_object_key_schema(db_session)`: try `from storage.minio_backend import MinIOBackend`; build an instance with stubbed `Minio` client (use `unittest.mock.MagicMock` for `self._client`); call `await backend.put_object(user_id="11111111-1111-1111-1111-111111111111", document_id="22222222-2222-2222-2222-222222222222", file_bytes=b"x", extension=".pdf", content_type="application/pdf")`; assert returned key matches the regex `r'^[^/]+/[^/]+/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}(\.[a-zA-Z0-9]+)?$'` and assert the middle UUID segment is NOT equal to the user_id or document_id; assert the file extension `.pdf` is preserved at the tail. + + 2. `test_filename_not_in_object_key()`: same setup; pass extension `.pdf` but record that the human filename `invoice_Q3_2025_secret.pdf` is never passed into the SDK at all — assert `"invoice"` not in returned key, `"Q3"` not in returned key, `"secret"` not in returned key. + + 3. `test_storage_backend_abc_methods()`: try `from storage.base import StorageBackend`; define a local class `Stub(StorageBackend): pass`; assert `pytest.raises(TypeError)` when `Stub()` is instantiated (because all 5 abstract methods are unimplemented). + + 4. `test_get_storage_backend_returns_minio()`: try `from storage import get_storage_backend; from storage.minio_backend import MinIOBackend`; assert `isinstance(get_storage_backend(), MinIOBackend)`. + + 5. `test_put_object_uses_asyncio_to_thread(monkeypatch)`: assert that `MinIOBackend.put_object` does NOT call `self._client.put_object` directly inside the async function — it must wrap with `asyncio.to_thread` (verifiable by monkeypatching `asyncio.to_thread` to a tracking mock and asserting it was called with `self._client.put_object` as the first arg). RESEARCH.md Pattern 3. + + 6. `test_minio_backend_health_check_returns_bool()`: stub `self._client.bucket_exists` to return `True`; await `health_check()`; assert return is exactly `True`. Then stub it to raise `Exception("boom")`; assert `health_check()` returns `False`. + + Each test wraps its imports in `try/except ImportError as e: pytest.skip(f"{e}")` so they collect cleanly before Plan 04 lands. + + Create `backend/tests/test_alembic.py` containing two tests, each marked `@pytest.mark.xfail(strict=False, reason="implemented in plan 03")`: + + 1. `test_migration_creates_all_tables(tmp_path, monkeypatch)`: create a fresh aiosqlite DB file under tmp_path; set `DATABASE_MIGRATE_URL` env var to `sqlite+aiosqlite:///`; invoke `alembic.command.upgrade(Config("backend/alembic.ini"), "head")` (use the python API not subprocess); connect with an async engine; query `sqlite_master` for table names; assert the set `{"users","quotas","refresh_tokens","folders","documents","topics","document_topics","shares","audit_log","cloud_connections","groups"}` is a subset of the materialized tables. NOTE: Alembic on aiosqlite is acceptable for this test only — production uses PostgreSQL. + + 2. `test_documents_user_id_nullable(tmp_path)`: after running upgrade, run `PRAGMA table_info(documents)` (SQLite) or `INFORMATION_SCHEMA.COLUMNS` query for PostgreSQL targets; assert the `user_id` column's `notnull` flag is `0` / `is_nullable == 'YES'` (D-03). + + Wrap Alembic imports in `try/except ImportError: pytest.skip(...)`. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -m pytest tests/test_storage.py tests/test_alembic.py -v 2>&1 | tail -30 + + + - File `backend/tests/test_storage.py` exists and contains all six test function names: `test_object_key_schema`, `test_filename_not_in_object_key`, `test_storage_backend_abc_methods`, `test_get_storage_backend_returns_minio`, `test_put_object_uses_asyncio_to_thread`, `test_minio_backend_health_check_returns_bool` (verifiable via `grep -c "^async def test_\|^def test_" backend/tests/test_storage.py >= 6`) + - Each test in `test_storage.py` carries `@pytest.mark.xfail(strict=False` (verifiable via `grep -c "@pytest.mark.xfail" backend/tests/test_storage.py >= 6`) + - File `backend/tests/test_alembic.py` exists and contains both test function names `test_migration_creates_all_tables`, `test_documents_user_id_nullable` + - `cd backend && python3 -m pytest tests/test_storage.py tests/test_alembic.py -v` exits 0 (xfail/skip both count as non-failing) + - Output of the same pytest run mentions `xfailed` or `skipped` at least 8 times in total (6 + 2) — verifiable via `python3 -m pytest tests/test_storage.py tests/test_alembic.py -v | grep -E "xfail|XFAIL|skipped|SKIPPED" | wc -l >= 8` + - The regex literal used to match object keys in `test_object_key_schema` is exactly `r'^[^/]+/[^/]+/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}(\.[a-zA-Z0-9]+)?$'` (verifiable via grep with the literal pattern) + - `test_alembic.py` references all 11 table names from the schema: `users`, `quotas`, `refresh_tokens`, `folders`, `documents`, `topics`, `document_topics`, `shares`, `audit_log`, `cloud_connections`, `groups` (verifiable via `for t in users quotas refresh_tokens folders documents topics document_topics shares audit_log cloud_connections groups; do grep -q "\"$t\"\|'$t'" backend/tests/test_alembic.py || echo MISSING:$t; done` produces no MISSING lines) + + Wave 0 unit tests for Plan 04 (storage) and Plan 03 (migration) exist as xfail-marked tests; the suite collects and passes; the tests define the contract that Plans 03 + 04 must satisfy. + + + + Task 3: Extend tests/test_health.py + port tests/test_documents.py to the async client + backend/tests/test_health.py, backend/tests/test_documents.py + + - `test_health_status_ok`: `GET /health` returns 200 and `data["status"]` is the string `"ok"` (unchanged behavior — keeps Plan 01 green) + - `test_health_checks_postgres_and_minio` (xfail until Plan 05): response JSON has a `checks` dict with keys `postgres` and `minio` both equal to `"ok"` + - Existing document upload/list/get/delete tests are PORTED to the async client (`def` → `async def`, `client.X(...)` → `await async_client.X(...)`) — every existing assertion is preserved verbatim; the new async-port tests are xfail until Plan 05 lands the storage rewrite + - The current sync versions are NOT deleted in this plan — Plan 05 deletes them as part of the cutover so the existing flat-file code stays validated until then + - One new test `test_upload_persists_to_postgres_and_minio(async_client, sample_txt)` (xfail until Plan 05) asserts that after a successful upload, the response includes both an `id` (uuid string) and the document is queryable via `GET /api/documents/{id}` returning the same metadata + + + - backend/tests/test_health.py (current 5-line file) + - backend/tests/test_documents.py (current 108-line file — all existing test functions must be preserved during this plan) + - .planning/phases/01-infrastructure-foundation/01-VALIDATION.md (Wave 0 Requirements: extend test_health.py and test_documents.py) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/tests/test_health.py section — `test_health_checks_postgres_and_minio` source pattern; backend/tests/test_documents.py section — sync→async port pattern) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Phase Requirements → Test Map; STORE-07 health checks) + + + Edit `backend/tests/test_health.py`: KEEP the existing `test_health(client)` test exactly as-is (renaming is not required; it documents the current behavior). APPEND a new test `test_health_checks_postgres_and_minio(async_client)` that issues `await async_client.get("/health")` and asserts: `resp.status_code == 200`, `data := resp.json()`, `"checks" in data`, `"postgres" in data["checks"]`, `"minio" in data["checks"]`, `data["checks"]["postgres"] == "ok"`, `data["checks"]["minio"] == "ok"`, and `data["status"] == "ok"`. Mark this new test `@pytest.mark.xfail(strict=False, reason="extended health probe implemented in plan 05")`. + + Edit `backend/tests/test_documents.py`: KEEP all existing sync tests verbatim. APPEND a new section commented `# ── Async port (Plan 05 cutover) ─────────────────────────` containing async versions of each existing test under names with the suffix `_async`: `test_upload_txt_no_classify_async`, `test_upload_pdf_no_classify_async`, `test_list_documents_async`, `test_list_documents_filter_by_topic_async`, `test_get_document_async`, `test_get_document_not_found_async`, `test_delete_document_async`, `test_delete_document_not_found_async`, `test_upload_empty_file_async`. Each `_async` test is `async def`, takes `async_client` instead of `client`, uses `await async_client.post(...)`/`await async_client.get(...)`/`await async_client.delete(...)`, preserves every assertion from its sync counterpart, and is marked `@pytest.mark.xfail(strict=False, reason="async storage layer implemented in plan 05")`. Additionally, ADD one new test `test_upload_persists_to_postgres_and_minio_async(async_client, sample_txt)` (same xfail marker) that uploads `sample_txt`, parses the returned JSON, asserts `data["id"]` matches the uuid regex `r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'`, then `GET /api/documents/{id}` and asserts the metadata round-trips with `data["original_name"] == "sample.txt"`. For the topic-filter port (`test_list_documents_filter_by_topic_async`), replace the `import services.storage as st; st.update_document_topics(...)` step with a direct SQL update against `db_session` (e.g., `await db_session.execute(update(Document).where(...).values(...))`) — wrap this in `try/except ImportError: pytest.skip(...)` because the model imports may not exist yet. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -m pytest tests/test_health.py tests/test_documents.py -v 2>&1 | tail -30 + + + - `backend/tests/test_health.py` contains BOTH `def test_health(` (existing sync test, unchanged) AND `async def test_health_checks_postgres_and_minio(` (new async test) + - The new `test_health_checks_postgres_and_minio` is marked `@pytest.mark.xfail` + - `cd backend && python3 -m pytest tests/test_health.py::test_health -v` exits 0 with `passed` status (the existing test still passes) + - `backend/tests/test_documents.py` contains every existing sync test name (`test_upload_txt_no_classify`, `test_upload_pdf_no_classify`, `test_list_documents`, `test_list_documents_filter_by_topic`, `test_get_document`, `test_get_document_not_found`, `test_delete_document`, `test_delete_document_not_found`, `test_upload_empty_file`) — verifiable via grep for each + - `backend/tests/test_documents.py` contains all nine `_async` counterparts plus `test_upload_persists_to_postgres_and_minio_async` — verifiable via `grep -c "^async def test_.*_async\b" backend/tests/test_documents.py >= 9` + - At least 10 `@pytest.mark.xfail` markers are present in `test_documents.py` (9 ports + 1 persistence test) + - `cd backend && python3 -m pytest tests/test_documents.py -v` exits 0 (sync tests pass, async tests xfail) — verify in output that the sync tests `test_upload_txt_no_classify` etc. report `PASSED` + - Total Wave-0 xfail count across the suite: `cd backend && python3 -m pytest tests/ -v 2>&1 | grep -cE "XFAIL|xfail"` >= 18 (6 storage + 2 alembic + 1 health + 9 async-port + 1 persistence = 19 at minimum) + + Health and document test files now contain both the legacy sync tests (still passing) and the new async/PostgreSQL/MinIO-backed tests (xfail until Plan 05); the full suite is green; Wave 0 gaps from VALIDATION.md are filled with executable scaffolds. + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| Test harness → backend code | Tests instantiate `MinIOBackend` and ORM models in isolation; in-memory aiosqlite engine prevents test pollution of real services | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-01-02-01 | Information Disclosure | Test fixtures leaking sensitive data into committed test files | mitigate | All test data is synthetic (`invoice_Q3_2025_secret.pdf` is a literal string, no real PII); aiosqlite DB lives in `tmp_path` and `:memory:` only | +| T-01-02-02 | Tampering | Object key schema regression introduces filename leakage | mitigate | `test_filename_not_in_object_key` asserts the human filename is never present in the returned object key (STORE-02); regression would xfail-flip to FAILED and break CI | +| T-01-02-03 | Tampering | Migration creates documents.user_id as NOT NULL (violates D-03) | mitigate | `test_documents_user_id_nullable` asserts the column's `notnull` flag is 0; regression breaks the test | +| T-01-02-SC | Tampering | npm/pip/cargo installs | N/A | No new package installs in this plan; tests reuse Plan 01's dependency set | + + + +- `cd backend && python3 -m pytest tests/ -v` exits 0. +- The suite reports at least 18 `XFAIL` results plus the existing sync tests as PASSED. +- All new test files import their not-yet-existing dependencies inside `try/except ImportError: pytest.skip(...)` blocks so collection never fails. + + + +- `tests/conftest.py` provides both legacy sync fixtures (unchanged) and new async `db_session` + `async_client` fixtures. +- `tests/test_storage.py` and `tests/test_alembic.py` exist and collect cleanly. +- `tests/test_health.py` carries the extended-health-probe scaffold; `tests/test_documents.py` carries an async port of every existing test plus a new persistence test. +- Every test that depends on Plan 03+ code is `xfail(strict=False)` so the suite stays green between waves. +- Total xfail count >= 18 across the new scaffolds. + + + +Create `.planning/phases/01-infrastructure-foundation/01-02-SUMMARY.md` when done — list every test added, every xfail marker, and the total xfail count. + diff --git a/.planning/phases/01-infrastructure-foundation/01-03-PLAN.md b/.planning/phases/01-infrastructure-foundation/01-03-PLAN.md new file mode 100644 index 0000000..2734b6c --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-03-PLAN.md @@ -0,0 +1,357 @@ +--- +phase: 01-infrastructure-foundation +plan: 03 +type: execute +wave: 2 +depends_on: + - 01-01 + - 01-02 +files_modified: + - backend/db/__init__.py + - backend/db/models.py + - backend/db/session.py + - backend/deps/__init__.py + - backend/deps/db.py + - backend/alembic.ini + - backend/migrations/env.py + - backend/migrations/script.py.mako + - backend/migrations/versions/0001_initial_schema.py +autonomous: false +requirements: + - STORE-01 + - STORE-07 +user_setup: [] +tags: + - schema + - sqlalchemy + - alembic + - postgresql + - migration + +must_haves: + truths: + - "`backend/db/models.py` declares all 11 SQLAlchemy ORM model classes for the full v1 schema (D-01, D-02)" + - "`documents.user_id` is declared as nullable in the ORM and emitted as nullable in the generated migration (D-03)" + - "`backend/db/session.py` exposes `engine` and `AsyncSessionLocal` reading from `config.settings.database_url` with `expire_on_commit=False`" + - "`backend/deps/db.py` exposes an async `get_db()` dependency yielding an `AsyncSession`" + - "`backend/alembic.ini` references `DATABASE_MIGRATE_URL` via env interpolation (`%(DATABASE_MIGRATE_URL)s`)" + - "`backend/migrations/env.py` uses `async_engine_from_config` + `asyncio.run`, imports `db.models.Base` so `target_metadata = Base.metadata` populates" + - "`backend/migrations/versions/0001_initial_schema.py` creates all 11 tables and ends with `op.execute(\"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO docuvault_app;\")` plus an immediate `GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO docuvault_app;`" + - "`alembic upgrade head` runs cleanly against a live PostgreSQL instance" + - "Test scaffolds `tests/test_alembic.py::test_migration_creates_all_tables` and `tests/test_alembic.py::test_documents_user_id_nullable` flip from XFAIL to PASSED" + artifacts: + - path: "backend/db/models.py" + provides: "Full v1 ORM schema — User, Quota, RefreshToken, Folder, Document, Topic, DocumentTopic, Share, AuditLog, CloudConnection, Group" + contains: "class Document(Base)" + min_lines: 130 + - path: "backend/db/session.py" + provides: "Async engine + async_sessionmaker" + contains: "create_async_engine" + - path: "backend/deps/db.py" + provides: "FastAPI dependency yielding AsyncSession" + contains: "async def get_db" + - path: "backend/alembic.ini" + provides: "Alembic config pointing at DATABASE_MIGRATE_URL" + contains: "script_location = migrations" + - path: "backend/migrations/env.py" + provides: "Async migration env wiring Base.metadata" + contains: "async_engine_from_config" + - path: "backend/migrations/versions/0001_initial_schema.py" + provides: "Initial v1 schema migration" + contains: "op.create_table" + key_links: + - from: "backend/migrations/env.py" + to: "backend/db/models.py" + via: "from db.models import Base" + pattern: "from db\\.models import Base" + - from: "backend/db/session.py" + to: "config.settings.database_url" + via: "create_async_engine(settings.database_url, ...)" + pattern: "create_async_engine\\(settings\\.database_url" + - from: "backend/alembic.ini" + to: "env var DATABASE_MIGRATE_URL" + via: "%(DATABASE_MIGRATE_URL)s interpolation" + pattern: "%\\(DATABASE_MIGRATE_URL\\)s" +--- + + +Establish the database layer: declare the full v1 SQLAlchemy ORM schema in `backend/db/models.py` (D-01, D-02, D-03), wire async engine + session factory in `backend/db/session.py`, expose a FastAPI dependency in `backend/deps/db.py`, scaffold Alembic with the async template, and author the initial migration `0001_initial_schema.py`. The plan ends with a human-verify checkpoint where we boot PostgreSQL via Docker Compose and run `alembic upgrade head` to confirm the migration applies cleanly against the live database — the Phase 1 success criterion (ROADMAP.md criterion #2). + +Purpose: Without a working schema, no document can be stored, no health check can probe PostgreSQL, and the walking skeleton cannot complete its end-to-end loop. This plan delivers the data foundation that Plans 04 and 05 will write to. + +Output: Five new Python files under `backend/db/`, `backend/deps/`, and `backend/migrations/`; an `alembic.ini`; the initial migration file; and verified `alembic upgrade head` success against the live Docker PostgreSQL. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-infrastructure-foundation/01-CONTEXT.md +@.planning/phases/01-infrastructure-foundation/01-RESEARCH.md +@.planning/phases/01-infrastructure-foundation/01-PATTERNS.md +@.planning/phases/01-infrastructure-foundation/01-VALIDATION.md +@.planning/phases/01-infrastructure-foundation/SKELETON.md +@.planning/phases/01-infrastructure-foundation/01-01-SUMMARY.md +@.planning/phases/01-infrastructure-foundation/01-02-SUMMARY.md + + +This plan creates the contracts other plans depend on. Subsequent plans MUST import from these locations only: + +```python +# backend/db/models.py — exports +class Base(DeclarativeBase): pass +class User(Base): __tablename__ = "users" +class Quota(Base): __tablename__ = "quotas" +class RefreshToken(Base): __tablename__ = "refresh_tokens" +class Folder(Base): __tablename__ = "folders" +class Document(Base): __tablename__ = "documents" # user_id NULLABLE per D-03 +class Topic(Base): __tablename__ = "topics" +class DocumentTopic(Base): __tablename__ = "document_topics" +class Share(Base): __tablename__ = "shares" +class AuditLog(Base): __tablename__ = "audit_log" +class CloudConnection(Base): __tablename__ = "cloud_connections" +class Group(Base): __tablename__ = "groups" # D-02 stub + +# backend/db/session.py — exports +engine: AsyncEngine # bound to settings.database_url +AsyncSessionLocal: async_sessionmaker[AsyncSession] # expire_on_commit=False + +# backend/deps/db.py — exports +async def get_db() -> AsyncGenerator[AsyncSession, None] +``` + +Existing code that consumes the new layer: +- Plan 04 imports `Base`, all model classes, and `AsyncSessionLocal` +- Plan 05 imports `AsyncSessionLocal` (lifespan) and `get_db` (route handlers) +- Plan 02's `tests/conftest.py` already references `from db.models import Base` and `from deps.db import get_db` inside try/except blocks — those imports must succeed after this plan + +URL prefix is `postgresql+psycopg://` (RESEARCH.md Pattern 1 — same driver for sync Alembic and async FastAPI). + + + + + + + Task 1: Author backend/db/models.py — full v1 SQLAlchemy 2.0 ORM schema + backend/db/__init__.py, backend/db/models.py, backend/db/session.py, backend/deps/__init__.py, backend/deps/db.py + + - Importing `from db.models import Base` succeeds and `Base.metadata.tables` contains all 11 expected table names + - `Document.__table__.c.user_id.nullable is True` (D-03) + - `Group` class exists with `__tablename__ = "groups"` even though it has no rows (D-02 stub) + - `from db.session import engine, AsyncSessionLocal` returns objects whose URL matches `settings.database_url` and whose sessionmaker has `expire_on_commit=False` + - `from deps.db import get_db` returns an async generator function + - `Quota.limit_bytes` default equals `104857600` (100 MB) + - All UUID-primary-keyed tables use `UUID(as_uuid=True)` with `default=uuid.uuid4` + - All timestamp columns use `TIMESTAMP(timezone=True)` with `server_default=func.now()` + - The `Document` model has indexes `ix_documents_user_folder` and `ix_documents_user_created` + - The `AuditLog` model uses `INET` for `ip_address` and `JSONB` for `metadata` (PostgreSQL-specific dialect types) + + + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Code Examples lines 769-908 — full schema; Pattern 1 lines 240-289 — engine + session factory; Config Extension lines 914-937) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/db/models.py, backend/db/session.py, backend/deps/db.py sections) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-01 full schema; D-02 groups stub; D-03 documents.user_id nullable) + - backend/config.py (after Plan 01 — must contain `settings.database_url`) + - backend/tests/test_alembic.py (Plan 02 output — defines expected table set and nullable assertion) + + + Create directory `backend/db/` containing an empty `__init__.py` and a `models.py` that declares the full v1 schema using SQLAlchemy 2.0 typed `Mapped[]` syntax. Imports: `import uuid`, `from datetime import datetime`, `from sqlalchemy import Boolean, BigInteger, ForeignKey, Index, String, Text, TIMESTAMP, UniqueConstraint, Integer`, `from sqlalchemy.dialects.postgresql import UUID, INET, JSONB`, `from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship`, `from sqlalchemy.sql import func`. Declare `class Base(DeclarativeBase): pass`. Declare the eleven model classes exactly per RESEARCH.md lines 788-908 (verbatim — that block was designed to be implementation-ready): + `User`, `Quota`, `RefreshToken`, `Folder`, `Document` (with `user_id: Mapped[uuid.UUID | None]` and `nullable=True` per D-03), `Topic`, `DocumentTopic` (composite PK), `Share`, `AuditLog` (with `INET` for `ip_address`, `JSONB` for the `metadata_` column — rename to `metadata_` and pass `name="metadata"` as a `mapped_column` kwarg because `metadata` shadows the SQLAlchemy reserved attribute on `DeclarativeBase`; reflect this column name choice in the migration too), `CloudConnection`, `Group` (D-02 stub). + + Create `backend/db/session.py` per RESEARCH.md Pattern 1: import `from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession`, import `from config import settings`. Declare module-level `engine = create_async_engine(settings.database_url, pool_pre_ping=True, echo=False)` and `AsyncSessionLocal = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)` — `expire_on_commit=False` is mandatory (Pitfall 1). + + Create directory `backend/deps/` containing an empty `__init__.py` and a `db.py` with: `from db.session import AsyncSessionLocal`; `async def get_db():` that wraps `async with AsyncSessionLocal() as session: try: yield session finally: await session.close()`. + + For the SQLAlchemy `metadata` reserved-name issue on `AuditLog`: declare the column as `metadata_: Mapped[dict | None] = mapped_column("metadata", JSONB, nullable=True)` so the ORM attribute is `metadata_` but the DB column is named `metadata`. Document this in a code comment. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -c " +from db.models import Base, User, Quota, RefreshToken, Folder, Document, Topic, DocumentTopic, Share, AuditLog, CloudConnection, Group +expected = {'users','quotas','refresh_tokens','folders','documents','topics','document_topics','shares','audit_log','cloud_connections','groups'} +actual = set(Base.metadata.tables.keys()) +assert expected == actual, f'mismatch: missing={expected-actual} extra={actual-expected}' +assert Document.__table__.c.user_id.nullable is True, 'documents.user_id must be nullable (D-03)' +assert Quota.__table__.c.limit_bytes.default.arg == 104857600, 'quota limit_bytes default must be 100MB' +from db.session import engine, AsyncSessionLocal +from deps.db import get_db +print('models-ok') +" + + + - `backend/db/models.py` exists and the verify command prints `models-ok` exiting 0 + - File contains exactly 11 `__tablename__ = ""` lines covering all of `users`, `quotas`, `refresh_tokens`, `folders`, `documents`, `topics`, `document_topics`, `shares`, `audit_log`, `cloud_connections`, `groups` (verifiable via `grep -E "__tablename__ = \"(users|quotas|refresh_tokens|folders|documents|topics|document_topics|shares|audit_log|cloud_connections|groups)\"" backend/db/models.py | sort -u | wc -l` equals 11) + - File contains `nullable=True` on the `Document.user_id` mapped_column declaration (D-03) + - File contains `class Base(DeclarativeBase)` + - File imports `UUID`, `INET`, `JSONB` from `sqlalchemy.dialects.postgresql` + - File contains `mapped_column("metadata", JSONB` for `AuditLog.metadata_` (avoids the reserved-attribute conflict) + - `backend/db/session.py` contains `create_async_engine(settings.database_url` + - `backend/db/session.py` contains `expire_on_commit=False` + - `backend/db/session.py` contains `pool_pre_ping=True` + - `backend/deps/db.py` contains `async def get_db()` + - `backend/deps/db.py` contains `async with AsyncSessionLocal() as session:` + - Files `backend/db/__init__.py` and `backend/deps/__init__.py` exist (empty is acceptable) + - From the backend dir, `python3 -c "from db.models import Base; from db.session import engine, AsyncSessionLocal; from deps.db import get_db"` exits 0 + - `Base.metadata.tables['documents'].c.user_id.nullable` is `True` (verifiable through the verify command's inline assertion) + + The full v1 ORM schema is declared, the async engine + session factory is wired, and the FastAPI dependency is exposed. Plans 04 and 05 can now import these symbols. + + + + Task 2: Scaffold Alembic async + author 0001_initial_schema.py migration + backend/alembic.ini, backend/migrations/env.py, backend/migrations/script.py.mako, backend/migrations/versions/0001_initial_schema.py + + - `alembic.ini` declares `script_location = migrations` and `sqlalchemy.url = %(DATABASE_MIGRATE_URL)s` so Alembic reads the migration DSN from the environment + - `migrations/env.py` uses `async_engine_from_config` + `asyncio.run(run_async_migrations())` and imports `from db.models import Base` so `target_metadata = Base.metadata` is populated (Pitfall 2) + - `migrations/versions/0001_initial_schema.py` has `revision = "0001"`, `down_revision = None`, and an `upgrade()` that creates all 11 tables in dependency order (users → quotas, folders, refresh_tokens, topics → documents → document_topics, shares, audit_log → cloud_connections; groups standalone) + - `upgrade()` ends with two `op.execute()` calls: (a) `GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO docuvault_app;` and (b) `ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO docuvault_app;` (Pitfall 4) + - `downgrade()` is a no-op or drops tables in reverse dependency order (Phase 1 will never call downgrade in practice but a defensive drop is acceptable) + - All UUID primary keys use `sa.dialects.postgresql.UUID(as_uuid=True)` and a `sa.text("gen_random_uuid()")` server_default OR are populated by application-level `default=uuid.uuid4` (matching the ORM declaration) + + + - backend/db/models.py (Task 1 output — schema definitions the migration must materialize) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pattern 2 — Alembic async env.py; Pitfall 2 — must import models; Pitfall 4 — ALTER DEFAULT PRIVILEGES inside migration) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/alembic.ini section + backend/migrations/env.py section) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-13 two DSNs; D-14 init script handles role creation, not table grants) + - .env.example (Plan 01 output — confirm `DATABASE_MIGRATE_URL` variable name) + + + From the `backend/` directory, run `alembic init -t async migrations` once to generate the standard async template (`migrations/env.py`, `migrations/script.py.mako`, `migrations/versions/`, and a root `alembic.ini`). Then make these specific edits: + + **`alembic.ini`**: set `script_location = migrations`. Set `sqlalchemy.url = %(DATABASE_MIGRATE_URL)s`. Remove any leftover example URL. Leave other defaults from the template. + + **`migrations/env.py`**: replace the placeholder `target_metadata = None` with the block per RESEARCH.md Pattern 2: + `from db.models import Base # noqa: F401 — must import to register all models (Pitfall 2)` + `target_metadata = Base.metadata` + Also add `import os` at the top, and BEFORE `config = context.config`, inject the runtime DSN by calling `config.set_main_option("sqlalchemy.url", os.environ.get("DATABASE_MIGRATE_URL", config.get_main_option("sqlalchemy.url") or ""))` — this is necessary because `%(DATABASE_MIGRATE_URL)s` interpolation in `alembic.ini` only reads from `[alembic]` section variables, not OS env. Keep the rest of the generated `run_migrations_offline` / `run_async_migrations` / `run_migrations_online` structure verbatim from the template. + + **`migrations/versions/0001_initial_schema.py`**: hand-author this file (do NOT use `alembic revision --autogenerate` here — Phase 1 has no prior schema to diff). File header sets `revision = "0001"`, `down_revision = None`, `branch_labels = None`, `depends_on = None`. The `upgrade()` function uses `op.create_table()` calls for all 11 tables matching the ORM declarations in `backend/db/models.py`, with `op.create_index()` calls for every `Index(...)` declared on the ORM side (`ix_refresh_tokens_user_revoked`, `ix_documents_user_folder`, `ix_documents_user_created`, `ix_shares_recipient`, `ix_audit_user_created`, `ix_audit_event_created`, `ix_cloud_connections_user`). Use `sa.dialects.postgresql.UUID(as_uuid=True)` for UUID columns, `sa.dialects.postgresql.INET()` for `audit_log.ip_address`, `sa.dialects.postgresql.JSONB()` for the `metadata` column (use the literal column name `metadata` here since SQL doesn't care). The `documents.user_id` column MUST be `nullable=True` (D-03). Add `sa.UniqueConstraint("user_id", "parent_id", "name", name="uq_folders_user_parent_name")` on folders, and similar named constraints for topics and shares. End the `upgrade()` function with these two literal statements: + + ``` + op.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO docuvault_app;") + op.execute("ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO docuvault_app;") + op.execute("GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO docuvault_app;") + op.execute("ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT USAGE, SELECT ON SEQUENCES TO docuvault_app;") + ``` + + The last two SEQUENCES grants are needed because the `audit_log.id` autoincrement uses a sequence and the `docuvault_app` user must be able to call `nextval()`. Add a comment block above these statements explaining "Pitfall 4: ALTER DEFAULT PRIVILEGES required so future migrations inherit grants automatically; the `docuvault_app` user is created in `docker/postgres/initdb.d/01-init-users.sql` with CONNECT but no table privileges." + + Implement `downgrade()` to drop all tables in reverse dependency order: `op.drop_table("cloud_connections")`, then `audit_log`, `shares`, `document_topics`, `topics` (after document_topics), `documents`, `folders`, `refresh_tokens`, `quotas`, `groups`, `users`. Drop indexes before dropping tables where applicable. + + Keep `migrations/script.py.mako` exactly as generated by `alembic init`. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -c " +import os, configparser +cp = configparser.ConfigParser() +cp.read('alembic.ini') +assert cp.get('alembic', 'script_location') == 'migrations' +assert '%(DATABASE_MIGRATE_URL)s' in cp.get('alembic', 'sqlalchemy.url') +print('alembic-ini-ok') +" && grep -q "from db.models import Base" backend/migrations/env.py && grep -q "target_metadata = Base.metadata" backend/migrations/env.py && grep -q "ALTER DEFAULT PRIVILEGES" backend/migrations/versions/0001_initial_schema.py && grep -q "GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES" backend/migrations/versions/0001_initial_schema.py && echo "migration-files-ok" + + + - `backend/alembic.ini` exists and the verify script prints `alembic-ini-ok` + - `backend/migrations/env.py` exists and contains the line `from db.models import Base` + - `backend/migrations/env.py` contains `target_metadata = Base.metadata` + - `backend/migrations/env.py` contains `async_engine_from_config` (preserved from template) + - `backend/migrations/env.py` contains `config.set_main_option("sqlalchemy.url", os.environ.get("DATABASE_MIGRATE_URL"` + - `backend/migrations/script.py.mako` exists (from `alembic init`) + - `backend/migrations/versions/0001_initial_schema.py` exists and contains `revision = "0001"` + - `backend/migrations/versions/0001_initial_schema.py` contains `op.create_table` exactly 11 times (one per table — verifiable via `grep -c "^ op.create_table" backend/migrations/versions/0001_initial_schema.py | grep -q "^11$"`) + - The migration contains all 11 table-name literals as the first positional arg to `op.create_table` (verifiable: `for t in users quotas refresh_tokens folders documents topics document_topics shares audit_log cloud_connections groups; do grep -q "op.create_table(\"$t\"" backend/migrations/versions/0001_initial_schema.py || echo MISSING:$t; done` produces no output) + - The migration contains both `op.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES` and `op.execute("ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT` + - The migration contains `op.execute("GRANT USAGE, SELECT ON ALL SEQUENCES` + - The `documents` table creation block contains `nullable=True` on the `user_id` column line (D-03) + - The migration contains `sa.dialects.postgresql.INET` (for `audit_log.ip_address`) + - The migration contains `sa.dialects.postgresql.JSONB` (for `audit_log.metadata`) + - `cd backend && python3 -c "from alembic.config import Config; from alembic.script import ScriptDirectory; cfg = Config('alembic.ini'); sd = ScriptDirectory.from_config(cfg); revs = list(sd.walk_revisions()); assert any(r.revision == '0001' for r in revs), 'revision 0001 not found'; print('script-discovery-ok')"` exits 0 with `script-discovery-ok` + + Alembic is scaffolded with the async template; `env.py` imports the Base metadata so autogenerate would work; the initial migration creates all 11 tables and issues both immediate and default-privilege GRANTs for `docuvault_app`. + + + + Task 3: Boot PostgreSQL via Docker Compose and run `alembic upgrade head` against the live DB + (verification only) + + - docker-compose.yml (Plan 01 output) + - .env.example (Plan 01 output — values to copy into .env) + - backend/migrations/versions/0001_initial_schema.py (Task 2 output) + + + Plans 01-03 together: a `docker-compose.yml` with a healthy `postgres` service, a `docuvault_app` + `docuvault_migrate` PostgreSQL user pair, an Alembic async config, and an initial migration that creates the full v1 schema. We now boot PostgreSQL and apply the migration as the final acceptance step for ROADMAP.md Phase 1 success criterion #2: "Running `alembic upgrade head` applies the initial migration cleanly against the fresh PostgreSQL instance with no errors." + + + Run these commands from the project root and confirm each one exits 0 and produces the expected output: + + 1. `cp .env.example .env` (only if `.env` does not already exist — never overwrite a real `.env`) + 2. `docker compose up -d postgres` — wait ~15 seconds, then `docker compose ps postgres` and confirm the `STATUS` column shows `Up (healthy)`. If `unhealthy`, run `docker compose logs postgres` and report the first error. + 3. From inside the backend container: `docker compose run --rm backend bash -lc "cd /app && alembic upgrade head"` — confirm the output ends with `Running upgrade -> 0001, ` and exits 0. No `error`, `permission denied`, or `relation already exists` messages may appear. + 4. Confirm all 11 tables exist: + `docker compose exec postgres psql -U postgres -d docuvault -c "\dt"` + — output must list all 11 tables: `users`, `quotas`, `refresh_tokens`, `folders`, `documents`, `topics`, `document_topics`, `shares`, `audit_log`, `cloud_connections`, `groups`. + 5. Confirm `documents.user_id` is nullable: + `docker compose exec postgres psql -U postgres -d docuvault -c "SELECT column_name, is_nullable FROM information_schema.columns WHERE table_name='documents' AND column_name='user_id';"` + — output must show `user_id | YES`. + 6. Confirm `docuvault_app` can read tables (default-privileges grant): + `docker compose exec postgres psql -U docuvault_app -d docuvault -c "SELECT count(*) FROM users;"` + — output must show `0` (zero rows, no permission error). + 7. Re-run `cd backend && python3 -m pytest tests/test_alembic.py -v` from the host (or `docker compose run --rm backend bash -lc "cd /app && pytest tests/test_alembic.py -v"`) — both tests previously marked XFAIL should now report XPASSED (or PASSED if the xfail marker was removed) — at minimum, neither should report FAILED. + + + All 7 verification commands succeed; the migration applies cleanly; `docuvault_app` has working SELECT access; the two test_alembic.py scaffolds pass. + + + Common failures and fixes: + - `permission denied for schema public`: the `ALTER DEFAULT PRIVILEGES` grant must run as the `docuvault_migrate` user. Either re-run the migration as that user (set `DATABASE_MIGRATE_URL` to use `docuvault_migrate`), or temporarily run the migration as `postgres` (superuser) for the very first run. Fix the migration approach and retry. + - `relation "alembic_version" does not exist`: this is the first migration on an empty DB — normal; not an error. + - `alembic.util.exc.CommandError: Can't locate revision`: the `env.py` likely failed to import `db.models`. Add `import sys; sys.path.insert(0, '/app')` to the top of `env.py` if needed. + - Healthcheck stuck `starting`: increase `start_period` on the postgres service to `30s`. + + Type "approved" once all 7 verification steps pass, or describe the specific failure to resume planning with a fix. + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| Alembic process → PostgreSQL | Connects with `docuvault_migrate` (DDL privileges); never used by the runtime app | +| Runtime app → PostgreSQL | Connects with `docuvault_app` (DML only, no DDL); granted via the migration's `ALTER DEFAULT PRIVILEGES` | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-01-03-01 | Elevation of Privilege | App connecting with DDL-capable user | mitigate | Migration uses `DATABASE_MIGRATE_URL` (docuvault_migrate); `db/session.py` uses `settings.database_url` (docuvault_app). The two DSNs are read from disjoint env vars; cannot be cross-wired without a config change (D-13). | +| T-01-03-02 | Tampering | SQL injection via ORM string concatenation | mitigate | All schema access is through SQLAlchemy ORM with `mapped_column` declarations; no raw string interpolation in models or migration (SEC-03 / CLAUDE.md). The only raw SQL strings in the migration are constant DDL/GRANT statements with no user input. | +| T-01-03-03 | Information Disclosure | `audit_log.metadata` JSONB column leaking PII | accept | Phase 1 has no audit-log writes; `audit_log` table is created empty (D-01 schema seeding). Phase 2/4 must enforce no-document-content rule on writes (SEC-07, ADMIN-06). Documented in SKELETON.md "Out of Scope". | +| T-01-03-04 | Tampering | Migration applied as superuser leaves orphan privileges | mitigate | Migration ends with explicit `GRANT ... TO docuvault_app` and `ALTER DEFAULT PRIVILEGES ... TO docuvault_app` (Pitfall 4); only `docuvault_app` gets DML; superuser is not used by the runtime app. | +| T-01-03-SC | Tampering | npm/pip installs | N/A | No new package installs in this plan; uses sqlalchemy/psycopg/alembic from Plan 01. RESEARCH.md audit covers all three (slopcheck OK). | + + + +- Tasks 1 + 2 are autonomous; Task 3 is a blocking human-verify checkpoint. +- After Task 3 approval: `cd backend && python3 -m pytest tests/test_alembic.py -v` must show both tests as XPASSED or PASSED (no FAILED). +- `docker compose exec postgres psql -U postgres -d docuvault -c "\dt"` lists all 11 tables. +- `docker compose exec postgres psql -U docuvault_app -d docuvault -c "SELECT count(*) FROM users;"` returns `0` without permission error. + + + +- `backend/db/models.py` declares the full v1 schema (11 tables) with `Document.user_id` nullable (D-03) and `Group` table stub (D-02). +- `backend/db/session.py` and `backend/deps/db.py` provide the async engine, session factory, and FastAPI dependency. +- `backend/alembic.ini`, `backend/migrations/env.py`, and `backend/migrations/versions/0001_initial_schema.py` exist and work end-to-end. +- `alembic upgrade head` against the live Docker PostgreSQL applies cleanly — ROADMAP.md Phase 1 success criterion #2 met. +- `docuvault_app` user has working SELECT/INSERT/UPDATE/DELETE on all tables (D-14 + Pitfall 4 fix). +- `tests/test_alembic.py` tests stop xfailing. + + + +Create `.planning/phases/01-infrastructure-foundation/01-03-SUMMARY.md` when done. Include: full table list materialized, the exact Alembic command output from Task 3, any deviations from the schema in RESEARCH.md (e.g., the `metadata_` rename), and the PostgreSQL version that actually booted (image tag from `docker compose images postgres`). + diff --git a/.planning/phases/01-infrastructure-foundation/01-04-PLAN.md b/.planning/phases/01-infrastructure-foundation/01-04-PLAN.md new file mode 100644 index 0000000..ed6a346 --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-04-PLAN.md @@ -0,0 +1,360 @@ +--- +phase: 01-infrastructure-foundation +plan: 04 +type: execute +wave: 3 +depends_on: + - 01-03 +files_modified: + - backend/storage/__init__.py + - backend/storage/base.py + - backend/storage/minio_backend.py + - backend/services/storage.py +autonomous: true +requirements: + - STORE-01 + - STORE-02 + - STORE-07 +user_setup: [] +tags: + - storage + - minio + - sqlalchemy + - service-layer + - abc + +must_haves: + truths: + - "`backend/storage/base.py` declares `StorageBackend` ABC with five abstract methods matching the contract in Plan 02's test_storage.py" + - "`backend/storage/__init__.py` exports a `get_storage_backend()` factory returning a `MinIOBackend` instance configured from `config.settings`" + - "`backend/storage/minio_backend.py` implements all five `StorageBackend` methods; every call to the synchronous `Minio` SDK is wrapped in `asyncio.to_thread()`" + - "`MinIOBackend.put_object` produces object keys matching `{user_id}/{document_id}/{uuid4()}{ext}` exactly (D-06, STORE-02)" + - "The original human filename is never passed into the MinIO SDK and never appears in the returned object key" + - "`backend/services/storage.py` is entirely rewritten — no `filelock`, no flat-file I/O, no `json.loads/dumps` on metadata; every function is `async def` and accepts an `AsyncSession`" + - "The new `services/storage.py` preserves the existing function names so `api/documents.py` and `api/topics.py` can adopt the async signatures with minimal code change in Plan 05" + - "All six Plan 02 `tests/test_storage.py` tests flip from XFAIL to PASSED after this plan ships" + artifacts: + - path: "backend/storage/base.py" + provides: "StorageBackend ABC mirroring backend/ai/base.py" + contains: "class StorageBackend(ABC)" + - path: "backend/storage/__init__.py" + provides: "get_storage_backend() factory mirroring backend/ai/__init__.py" + contains: "def get_storage_backend" + - path: "backend/storage/minio_backend.py" + provides: "MinIO implementation of StorageBackend with asyncio.to_thread wrapping" + contains: "class MinIOBackend(StorageBackend)" + min_lines: 80 + - path: "backend/services/storage.py" + provides: "Async PostgreSQL+MinIO orchestrator replacing the flat-file implementation" + contains: "from db.models import Document" + min_lines: 120 + key_links: + - from: "backend/storage/__init__.py" + to: "backend/config.settings" + via: "factory reads minio_endpoint / minio_access_key / minio_secret_key / minio_bucket" + pattern: "settings\\.minio_(endpoint|access_key|secret_key|bucket)" + - from: "backend/storage/minio_backend.py" + to: "asyncio.to_thread" + via: "every put_object/get_object/delete_object/presigned/bucket_exists call" + pattern: "asyncio\\.to_thread\\(self\\._client\\." + - from: "backend/services/storage.py" + to: "backend/db/models" + via: "Document, Topic, DocumentTopic ORM queries" + pattern: "Document|Topic|DocumentTopic" + - from: "backend/services/storage.py" + to: "backend/storage.get_storage_backend" + via: "module-level call to obtain the configured StorageBackend" + pattern: "get_storage_backend" +--- + + +Build the storage abstraction layer (StorageBackend ABC + MinIO implementation following the established `backend/ai/` provider pattern) AND rewrite `backend/services/storage.py` to use async SQLAlchemy ORM + MinIO instead of flat-file JSON + filelock (D-05). Preserve the existing public function names (`save_upload`, `save_metadata`, `get_metadata`, `list_metadata`, `delete_document`, `update_document_topics`, `load_topics`, `save_topics`, `get_topic`, `create_topic`, `update_topic`, `delete_topic`, `topic_doc_counts`, `load_settings`, `save_settings`, `mask_api_key`, `settings_masked`) but change every function signature to `async def` that accepts an `AsyncSession` as the first parameter where ORM access is required. This plan does NOT yet wire the API routes — Plan 05 swaps callers from sync to async. + +Purpose: This is the heart of the walking skeleton. Without these modules, no document bytes can land in MinIO and no metadata can land in PostgreSQL. Plan 05 will compose this layer into the API + lifespan + Celery flow. + +Output: Three new files under `backend/storage/`, one fully rewritten `backend/services/storage.py`, and all six Plan 02 `tests/test_storage.py` scaffolds flipping to PASSED. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-infrastructure-foundation/01-CONTEXT.md +@.planning/phases/01-infrastructure-foundation/01-RESEARCH.md +@.planning/phases/01-infrastructure-foundation/01-PATTERNS.md +@.planning/phases/01-infrastructure-foundation/SKELETON.md +@.planning/phases/01-infrastructure-foundation/01-03-SUMMARY.md +@backend/ai/base.py +@backend/ai/__init__.py +@backend/ai/openai_provider.py + + +Project's established ABC + factory pattern (from `backend/ai/base.py` + `backend/ai/__init__.py`, lines 1-36 of each — read both files before implementing): + +```python +# backend/ai/base.py — pattern to mirror +class AIProvider(ABC): + @abstractmethod + async def classify(...) -> ClassificationResult: ... + @abstractmethod + async def health_check(self) -> bool: ... + +# backend/ai/__init__.py — factory pattern to mirror +def get_provider(settings: dict) -> AIProvider: + ... + match active: + case "openai": + return OpenAIProvider(...) + case _: + raise ValueError(...) +``` + +Existing `services/storage.py` public surface that `api/documents.py` and `api/topics.py` and `services/classifier.py` currently call (sync): +- `save_upload(file_bytes, original_name, mime_type) -> dict` (returns `{"id", "filename", "path"}`) +- `save_metadata(meta: dict) -> None` +- `get_metadata(doc_id: str) -> dict | None` +- `list_metadata(topic: str | None = None) -> list[dict]` +- `delete_document(doc_id: str) -> bool` +- `update_document_topics(doc_id: str, topics: list[str]) -> dict | None` +- `remove_topic_from_all_documents(topic_name: str) -> int` +- `load_topics() -> list[dict]` +- `save_topics(topics: list[dict]) -> None` +- `get_topic(topic_id: str) -> dict | None` +- `create_topic(name: str, description: str = "", color: str = "#6366f1") -> dict` +- `update_topic(topic_id: str, **kwargs) -> dict | None` +- `delete_topic(topic_id: str) -> str | None` (returns the deleted topic name) +- `topic_doc_counts() -> dict[str, int]` +- `load_settings() -> dict` +- `save_settings(settings: dict) -> None` +- `mask_api_key(key: str) -> str` +- `settings_masked(settings: dict) -> dict` + +New `services/storage.py` MUST keep all of those names but accept an `AsyncSession` parameter where DB access is needed, except `mask_api_key` and `settings_masked` (pure functions — keep sync). + +Schema fields the service layer reads/writes (declared in `backend/db/models.py` from Plan 03): +- `Document.id, user_id, folder_id, filename, object_key, content_type, size_bytes, storage_backend, extracted_text, status, created_at, updated_at` +- `Topic.id, user_id (NULLABLE — Phase 1 has no users), name, description, color` +- `DocumentTopic.document_id, topic_id` (composite PK association table) + +Settings persistence note: Phase 1 keeps `services/storage.load_settings()` / `save_settings()` reading the flat file `SETTINGS_FILE` (`/app/data/settings.json`) because the `users.ai_provider` / `users.ai_model` schema cannot be populated until Phase 2. Document this with a `# Phase 2 will migrate this to DB-backed per-user settings (D-03 deferred to user-scoped column population)` comment. + + + + + + + Task 1: Create backend/storage/ — StorageBackend ABC + factory + MinIOBackend implementation + backend/storage/__init__.py, backend/storage/base.py, backend/storage/minio_backend.py + + - Defining a subclass of `StorageBackend` that omits any of the five abstract methods (`put_object`, `get_object`, `delete_object`, `presigned_get_url`, `health_check`) and trying to instantiate it raises `TypeError` + - `get_storage_backend()` returns a `MinIOBackend` instance whose endpoint/access_key/secret_key/bucket come from `config.settings.minio_*` + - `await backend.put_object(user_id="u1", document_id="d1", file_bytes=b"abc", extension=".txt", content_type="text/plain")` returns a string of the form `u1/d1/.txt` + - The middle UUID segment of the returned key is a freshly generated `uuid4()` value — not the user_id and not the document_id + - The human filename is NEVER passed into the MinIO SDK call — only the constructed object key, content type, length, and `io.BytesIO(file_bytes)` are passed + - Every call to the synchronous `Minio` SDK (`put_object`, `get_object`, `remove_object`, `presigned_get_object`, `bucket_exists`, `make_bucket`) goes through `asyncio.to_thread(self._client., ...)` + - `await backend.health_check()` returns `True` when `bucket_exists` returns `True`; returns `False` when `bucket_exists` raises any exception + - `await backend.get_object(key)` returns the file bytes; under the hood it calls `self._client.get_object(...)` (which returns an `HTTPResponse`), then `response.read()`, then `response.close()` + `response.release_conn()` — all inside `asyncio.to_thread` + + + - backend/ai/base.py (the ABC pattern to mirror — read in full) + - backend/ai/__init__.py (the factory pattern to mirror — read in full) + - backend/ai/openai_provider.py (the concrete ABC implementation pattern — `__init__`, `_client` attribute, async methods, try/except `health_check`) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pattern 3 lines 343-419 — MinIO sync-in-async; Pattern 8 lines 607-651 — StorageBackend ABC code; Pitfall 3 — BytesIO seek(0); Code Examples lines 920-928 — config fields) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/storage/* sections) + - backend/tests/test_storage.py (Plan 02 output — the six xfail tests defining the contract this plan must satisfy) + - backend/config.py (Plan 01 output — confirm Settings field names match what factory reads) + + + Create `backend/storage/__init__.py`. Inside it: `from storage.base import StorageBackend`, `from storage.minio_backend import MinIOBackend`, `from config import settings`. Define `def get_storage_backend() -> StorageBackend:` that returns `MinIOBackend(endpoint=settings.minio_endpoint, access_key=settings.minio_access_key, secret_key=settings.minio_secret_key, bucket=settings.minio_bucket, secure=False)`. The `secure=False` is correct for Docker internal HTTP traffic between containers — RESEARCH.md Pattern 3. Add a module docstring noting this file mirrors `backend/ai/__init__.py`. + + Create `backend/storage/base.py` with the exact contents from RESEARCH.md Pattern 8 (lines 612-640): `from abc import ABC, abstractmethod`, `class StorageBackend(ABC):` with five `@abstractmethod async def` methods — `put_object(self, user_id: str, document_id: str, file_bytes: bytes, extension: str, content_type: str) -> str`, `get_object(self, object_key: str) -> bytes`, `delete_object(self, object_key: str) -> None`, `presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str`, `health_check(self) -> bool`. Each abstract method has a one-line docstring per RESEARCH.md. + + Create `backend/storage/minio_backend.py` implementing `MinIOBackend(StorageBackend)`. Imports: `import asyncio`, `import io`, `import uuid`, `from datetime import timedelta`, `from minio import Minio`, `from storage.base import StorageBackend`. Constructor `def __init__(self, endpoint: str, access_key: str, secret_key: str, bucket: str, secure: bool = False)` stores `self._bucket = bucket` and `self._client = Minio(endpoint=endpoint, access_key=access_key, secret_key=secret_key, secure=secure)`. + + Implement `async def put_object(self, user_id: str, document_id: str, file_bytes: bytes, extension: str, content_type: str) -> str`: construct `object_key = f"{user_id}/{document_id}/{uuid.uuid4()}{extension}"`. Build `data = io.BytesIO(file_bytes)` (the constructor sets pointer at 0 — no `seek(0)` needed per Pitfall 3, but add an explicit `data.seek(0)` immediately afterward as a belt-and-braces safety). Call `await asyncio.to_thread(self._client.put_object, self._bucket, object_key, data, length=len(file_bytes), content_type=content_type)`. Return `object_key`. IMPORTANT: do NOT accept any `filename`/`original_name` parameter — STORE-02 requires the human filename never reaches this method. + + Implement `async def get_object(self, object_key: str) -> bytes`: define a sync helper `def _fetch():` inside the method that calls `response = self._client.get_object(self._bucket, object_key)` then `try: return response.read() finally: response.close(); response.release_conn()`. Call `return await asyncio.to_thread(_fetch)`. This pattern correctly handles MinIO's HTTPResponse-returning `get_object`. + + Implement `async def delete_object(self, object_key: str) -> None`: `await asyncio.to_thread(self._client.remove_object, self._bucket, object_key)`. + + Implement `async def presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str`: `return await asyncio.to_thread(self._client.presigned_get_object, self._bucket, object_key, timedelta(minutes=expires_minutes))`. + + Implement `async def health_check(self) -> bool`: wrap `await asyncio.to_thread(self._client.bucket_exists, self._bucket)` in a `try/except Exception: return False`; the truthy path returns the exact boolean from the SDK. + + Finally, after writing the implementation, edit the six tests in `backend/tests/test_storage.py` to REMOVE the `@pytest.mark.xfail(strict=False, reason="implemented in plan 04")` markers (the tests should now PASS unmarked) — but keep the existing `try/except ImportError: pytest.skip(...)` blocks because they harmlessly fall through when imports succeed. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -m pytest tests/test_storage.py -v 2>&1 | tail -20 && python3 -c " +from storage.base import StorageBackend +from storage import get_storage_backend +from storage.minio_backend import MinIOBackend +assert isinstance(get_storage_backend(), MinIOBackend) +abstract = {m for m in StorageBackend.__abstractmethods__} +assert abstract == {'put_object','get_object','delete_object','presigned_get_url','health_check'}, f'abstract mismatch: {abstract}' +print('storage-abc-ok') +" + + + - `backend/storage/base.py` exists and contains `class StorageBackend(ABC):` + - `backend/storage/base.py` declares exactly 5 `@abstractmethod` methods (verifiable via `grep -c "@abstractmethod" backend/storage/base.py >= 5`) + - The 5 abstract method names are `put_object`, `get_object`, `delete_object`, `presigned_get_url`, `health_check` (verified by the inline `__abstractmethods__` assertion in the Verify command) + - `backend/storage/__init__.py` contains `def get_storage_backend() -> StorageBackend:` and returns a `MinIOBackend(endpoint=settings.minio_endpoint, ...)` + - `backend/storage/minio_backend.py` contains `class MinIOBackend(StorageBackend):` + - `backend/storage/minio_backend.py` calls `asyncio.to_thread(self._client.put_object`, `asyncio.to_thread(self._client.remove_object`, `asyncio.to_thread(self._client.presigned_get_object`, `asyncio.to_thread(self._client.bucket_exists` (each verifiable via grep — each must appear at least once) + - `backend/storage/minio_backend.py` contains the literal substring `f"{user_id}/{document_id}/{uuid.uuid4()}{extension}"` (the STORE-02 key format) + - `backend/storage/minio_backend.py` does NOT take any `filename` or `original_name` parameter on `put_object` (verifiable: `grep -E "def put_object.*filename|def put_object.*original_name" backend/storage/minio_backend.py` exits with no matches) + - `cd backend && python3 -m pytest tests/test_storage.py -v` exits 0 with all 6 tests reporting PASSED (no XFAIL, no SKIPPED, no FAILED) — verifiable via `python3 -m pytest tests/test_storage.py -v 2>&1 | grep -c "PASSED" >= 6` + - The `@pytest.mark.xfail` markers in `backend/tests/test_storage.py` are removed (verifiable: `grep -c "@pytest.mark.xfail" backend/tests/test_storage.py | grep -q "^0$"`) + + The `storage/` module mirrors the `ai/` provider pattern exactly; `MinIOBackend` correctly wraps every sync SDK call in `asyncio.to_thread`; STORE-02 object key format is enforced in code; all six Plan 02 `test_storage.py` scaffolds pass. + + + + Task 2: Rewrite backend/services/storage.py — async SQLAlchemy + MinIO orchestrator + backend/services/storage.py + + - `save_upload(session, file_bytes, original_name, mime_type) -> dict`: creates a `Document` row with a freshly generated UUID `id`, `filename=original_name`, `content_type=mime_type`, `size_bytes=len(file_bytes)`, `user_id=None` (D-03 — no auth in Phase 1), `storage_backend="minio"`, `status="pending"`; calls `await get_storage_backend().put_object(user_id=str(user_id or "null-user"), document_id=str(doc.id), file_bytes=file_bytes, extension=Path(original_name).suffix.lower(), content_type=mime_type)`; stores the returned `object_key` on the `Document.object_key` column; commits; returns a dict shape compatible with the existing API: `{"id": str(doc.id), "filename": original_name, "path": object_key, "object_key": object_key}` (the `path` key is kept for compatibility with the current `api/documents.py` line 33 — it now holds the object_key, not a filesystem path) + - `save_metadata(session, meta: dict) -> None`: idempotent update of a `Document` row from the dict shape used by the legacy `api/documents.py` (`id`, `original_name`, `extracted_text`, `topics`, `classified_at`, etc.); maps `meta["original_name"] → Document.filename`, `meta["mime_type"] → Document.content_type`, `meta["size_bytes"] → Document.size_bytes`, `meta["extracted_text"] → Document.extracted_text`; `topics` list is reconciled into the `DocumentTopic` association table (delete existing rows for the document, insert one per topic name — auto-creating topics that don't yet exist via the existing `create_topic` helper); `classified_at → Document.updated_at` + - `get_metadata(session, doc_id: str) -> dict | None`: returns `None` for not-found; otherwise returns a dict matching the response shape `api/documents.py` produces today: `{"id", "original_name", "filename", "mime_type", "size_bytes", "extracted_text", "topics" (list of topic name strings), "created_at", "classified_at"}` + - `list_metadata(session, topic: str | None = None) -> list[dict]`: returns the same dict shape per row; if `topic` is provided, filters to documents joined to a `Topic` row whose `name == topic` + - `delete_document(session, doc_id: str) -> bool`: returns `False` if the document does not exist; otherwise calls `await get_storage_backend().delete_object(doc.object_key)`, deletes the `Document` row (cascade deletes `DocumentTopic` rows via the ORM relationship `ondelete="CASCADE"`), commits, returns `True` + - `update_document_topics(session, doc_id, topics) -> dict | None`: replaces the association rows and returns the refreshed metadata dict (or `None` for not-found) + - `remove_topic_from_all_documents(session, topic_name) -> int`: deletes all `DocumentTopic` rows whose `topic_id` matches the topic with that name; returns the count + - Topic functions (`load_topics`, `create_topic`, `update_topic`, `delete_topic`, `get_topic`, `topic_doc_counts`) are async and query the `Topic` and `DocumentTopic` tables; `create_topic` auto-deduplicates by lowercased name (preserving the existing behavior); response shape matches the existing JSON: `{"id", "name", "description", "color"}` per topic; topic `id` is the UUID stringified + - Settings functions (`load_settings`, `save_settings`, `mask_api_key`, `settings_masked`) continue to read/write the flat `SETTINGS_FILE` JSON in Phase 1 — these will move to the `users` table in Phase 2; document this with a comment + - NO `filelock` import remains in this module + - NO `json.loads` / `json.dumps` / `Path.read_text` / `Path.write_text` calls touch document or topic data (only the still-flat `SETTINGS_FILE` may keep using JSON) + + + - backend/services/storage.py (current 188-line file — read in full; understand every function signature and return shape so the rewrite preserves the contract) + - backend/db/models.py (Plan 03 output — exact column names and relationships) + - backend/api/documents.py (read in full — note exactly which `storage.*` calls are made and what return-shape fields are consumed) + - backend/api/topics.py (read in full — same scrutiny for topic-related calls) + - backend/services/classifier.py (read in full — uses `storage.get_metadata`, `storage.load_settings`, `storage.load_topics`, `storage.create_topic`, `storage.update_document_topics`) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/services/storage.py section — explains the contract preservation strategy) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pitfall 1 — expire_on_commit=False; Pattern 1 — session lifecycle) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-04 data dir deleted, D-05 storage layer switched, D-06 object key schema) + + + Replace `backend/services/storage.py` entirely. New imports: `import uuid`, `import json`, `from datetime import datetime, timezone`, `from pathlib import Path`, `from sqlalchemy import select, func as sql_func, delete`, `from sqlalchemy.ext.asyncio import AsyncSession`, `from sqlalchemy.orm import selectinload`, `from db.models import Document, Topic, DocumentTopic`, `from storage import get_storage_backend`, `from config import settings as cfg_settings, DEFAULT_SETTINGS, SETTINGS_FILE`. Do NOT import `filelock` — that dependency is removed. + + Add a module-level `_storage = None` and a `def _backend(): global _storage; _storage = _storage or get_storage_backend(); return _storage` helper so the MinIO backend is lazily instantiated once per process (matching the singleton-like behavior of the previous module-level filelock objects). + + Implement every function below as `async def` (except `mask_api_key`, `settings_masked` which stay sync). The session is the first positional parameter for every DB-touching function. + + 1. `async def save_upload(session: AsyncSession, file_bytes: bytes, original_name: str, mime_type: str) -> dict`: generate `doc_id = uuid.uuid4()`. Extract `suffix = Path(original_name).suffix.lower()`. Build `doc = Document(id=doc_id, user_id=None, filename=original_name, content_type=mime_type, size_bytes=len(file_bytes), storage_backend="minio", status="pending", object_key="")`. `session.add(doc)`. `await session.flush()` (to materialize `doc.id` without committing). Call `object_key = await _backend().put_object(user_id="null-user", document_id=str(doc_id), file_bytes=file_bytes, extension=suffix, content_type=mime_type)` — the literal sentinel `"null-user"` is used because there is no user in Phase 1 (D-03); Phase 2 will replace this with `str(current_user.id)`. Set `doc.object_key = object_key`. `await session.commit()`. Return `{"id": str(doc_id), "filename": original_name, "path": object_key, "object_key": object_key, "user_id": None}`. + + 2. `async def save_metadata(session: AsyncSession, meta: dict) -> None`: look up `doc = await session.get(Document, uuid.UUID(meta["id"]))`. If not found, return. Set `doc.extracted_text = meta.get("extracted_text", "")`. If `meta.get("topics")` is a non-empty list of strings, call `await update_document_topics(session, meta["id"], meta["topics"])` (which handles association table reconciliation). Set `doc.status = "classified" if meta.get("classified_at") else "pending"`. `await session.commit()`. + + 3. `async def get_metadata(session: AsyncSession, doc_id: str) -> dict | None`: try `uid = uuid.UUID(doc_id)` inside try/except `ValueError: return None` (keep the legacy string-doc-id tolerance from `api/documents.py`). `doc = await session.get(Document, uid)`. If `None`, return `None`. Load topics: `topics_q = await session.execute(select(Topic.name).join(DocumentTopic, DocumentTopic.topic_id == Topic.id).where(DocumentTopic.document_id == uid))`; `topic_names = [r[0] for r in topics_q]`. Return `{"id": str(doc.id), "original_name": doc.filename, "filename": doc.filename, "mime_type": doc.content_type, "size_bytes": doc.size_bytes, "extracted_text": doc.extracted_text or "", "topics": topic_names, "created_at": doc.created_at.isoformat() if doc.created_at else None, "classified_at": doc.updated_at.isoformat() if doc.status == "classified" else None}`. + + 4. `async def list_metadata(session: AsyncSession, topic: str | None = None) -> list[dict]`: base query `stmt = select(Document).order_by(Document.created_at.desc())`. If `topic` is provided, join `DocumentTopic` and `Topic` and filter `Topic.name == topic`. Execute, then for each `Document` call the same dict-build code as `get_metadata` (factor it out into a private `def _doc_to_dict(doc, topic_names)` helper). + + 5. `async def delete_document(session: AsyncSession, doc_id: str) -> bool`: lookup as above; if not found, return `False`. Call `await _backend().delete_object(doc.object_key)` inside try/except (log to stderr on failure but still proceed with DB delete — the bytes may already be gone). `await session.delete(doc)`. `await session.commit()`. Return `True`. + + 6. `async def update_document_topics(session: AsyncSession, doc_id: str, topics: list[str]) -> dict | None`: lookup the doc; return `None` if missing. Delete all existing `DocumentTopic` rows for this document via `await session.execute(delete(DocumentTopic).where(DocumentTopic.document_id == uid))`. For each name in `topics` (deduped), call `topic_dict = await create_topic(session, name)` (auto-create if missing) and insert a `DocumentTopic(document_id=uid, topic_id=uuid.UUID(topic_dict["id"]))`. Set `doc.status = "classified"`. `await session.commit()`. Return `await get_metadata(session, doc_id)`. + + 7. `async def remove_topic_from_all_documents(session: AsyncSession, topic_name: str) -> int`: find the topic by name; if missing, return 0. Run `result = await session.execute(delete(DocumentTopic).where(DocumentTopic.topic_id == topic.id))`. `await session.commit()`. Return `result.rowcount`. + + 8. `async def load_topics(session: AsyncSession) -> list[dict]`: `q = await session.execute(select(Topic).order_by(Topic.name))`. Return `[{"id": str(t.id), "name": t.name, "description": t.description, "color": t.color} for t in q.scalars()]`. + + 9. `async def save_topics(session: AsyncSession, topics: list[dict]) -> None`: idempotent bulk replace — delete all `Topic` rows, then insert the provided list. (Phase 1 use: not directly called by any current endpoint; preserved for legacy compatibility — add a `# legacy: not used by current endpoints` comment.) + + 10. `async def get_topic(session: AsyncSession, topic_id: str) -> dict | None`: `t = await session.get(Topic, uuid.UUID(topic_id))`; return `None` or `{"id": str(t.id), ...}`. + + 11. `async def create_topic(session: AsyncSession, name: str, description: str = "", color: str = "#6366f1") -> dict`: case-insensitive deduplication — `q = await session.execute(select(Topic).where(sql_func.lower(Topic.name) == name.lower()))`. If a row exists, return its dict shape. Otherwise insert a new `Topic(name=name, description=description, color=color)`, commit, return dict. + + 12. `async def update_topic(session: AsyncSession, topic_id: str, name=None, description=None, color=None) -> dict | None`: lookup; if missing, `None`. Update only non-None fields. Commit. Return dict. + + 13. `async def delete_topic(session: AsyncSession, topic_id: str) -> str | None`: lookup; if missing, `None`. Save the name. Delete the Topic (cascade removes DocumentTopic rows via `ondelete="CASCADE"`). Commit. Return the saved name. + + 14. `async def topic_doc_counts(session: AsyncSession) -> dict[str, int]`: `q = await session.execute(select(Topic.name, sql_func.count(DocumentTopic.document_id)).join(DocumentTopic, DocumentTopic.topic_id == Topic.id, isouter=True).group_by(Topic.name))`. Return `{name: count for name, count in q}`. + + 15. `def load_settings() -> dict`: keep sync; read `SETTINGS_FILE` (still flat JSON) — if file missing, return a deep copy of `DEFAULT_SETTINGS`. Add comment `# Phase 2 will move per-user settings to users.ai_provider / users.ai_model`. + + 16. `def save_settings(settings: dict) -> None`: keep sync; write `SETTINGS_FILE`. No filelock — Phase 1 settings file is single-writer. + + 17. `def mask_api_key(key: str) -> str` and `def settings_masked(settings: dict) -> dict`: keep verbatim from current file (pure functions). + + At the bottom of the file, expose every function via explicit `__all__ = ["save_upload", "save_metadata", "get_metadata", "list_metadata", "delete_document", "update_document_topics", "remove_topic_from_all_documents", "load_topics", "save_topics", "get_topic", "create_topic", "update_topic", "delete_topic", "topic_doc_counts", "load_settings", "save_settings", "mask_api_key", "settings_masked"]` so importers see the public surface. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -c " +import inspect +import services.storage as st +async_funcs = ['save_upload','save_metadata','get_metadata','list_metadata','delete_document','update_document_topics','remove_topic_from_all_documents','load_topics','save_topics','get_topic','create_topic','update_topic','delete_topic','topic_doc_counts'] +sync_funcs = ['load_settings','save_settings','mask_api_key','settings_masked'] +for f in async_funcs: + assert inspect.iscoroutinefunction(getattr(st, f)), f'{f} should be async' +for f in sync_funcs: + fn = getattr(st, f) + assert not inspect.iscoroutinefunction(fn), f'{f} should be sync' +assert 'filelock' not in open(st.__file__).read(), 'filelock import still present' +assert 'FileLock' not in open(st.__file__).read(), 'FileLock import still present' +print('services-storage-rewrite-ok') +" + + + - `backend/services/storage.py` no longer imports `filelock` or `FileLock` (verifiable: `grep -cE "from filelock|import filelock|FileLock" backend/services/storage.py | grep -q "^0$"`) + - `backend/services/storage.py` imports `from sqlalchemy.ext.asyncio import AsyncSession` and `from db.models import Document, Topic, DocumentTopic` + - `backend/services/storage.py` imports `from storage import get_storage_backend` + - All 14 DB-touching functions are `async def` (verified by the inline `inspect.iscoroutinefunction` assertions in the Verify command) + - All 4 pure functions (`load_settings`, `save_settings`, `mask_api_key`, `settings_masked`) remain sync + - `save_upload` constructs an object_key via the MinIO backend (verifiable: `grep -c "await.*put_object" backend/services/storage.py >= 1`) + - `save_upload` passes `user_id="null-user"` (D-03 sentinel) — verifiable via `grep -c 'user_id="null-user"\|user_id=.null-user.' backend/services/storage.py >= 1` + - The file declares `__all__ = [...]` listing all 18 public functions (verifiable: `grep -c "__all__" backend/services/storage.py >= 1`) + - The file has at least 120 lines (the `must_haves.artifacts.min_lines` target — verifiable: `wc -l backend/services/storage.py | awk '{exit ($1 >= 120) ? 0 : 1}'`) + - No `Path(...).read_text()` or `Path(...).write_text()` call references `METADATA_DIR`, `UPLOADS_DIR`, `TOPICS_FILE` (verifiable: `grep -cE "METADATA_DIR|UPLOADS_DIR|TOPICS_FILE" backend/services/storage.py | grep -q "^0$"`) + - `SETTINGS_FILE` references remain (for `load_settings`/`save_settings` — verifiable: `grep -c "SETTINGS_FILE" backend/services/storage.py >= 1`) + - `cd backend && python3 -c "import services.storage; print('import-ok')"` exits 0 with `import-ok` + - All previous Plan 02 `test_storage.py` tests still PASSED (the rewrite must not have removed the `MinIOBackend` exports or the factory) — `cd backend && python3 -m pytest tests/test_storage.py -v` exits 0 with 6 PASSED + + `backend/services/storage.py` is fully async, all DB access goes through SQLAlchemy ORM, all object storage goes through the `MinIOBackend` via `get_storage_backend()`, no flat-file or filelock code remains for documents or topics. The public function names are preserved so Plan 05 can wire callers with `def → async def + await` plus a `session` parameter. + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| `services/storage.py` → `storage/minio_backend.py` | Internal Python call; bytes never cross a network boundary inside this module | +| `storage/minio_backend.py` → MinIO server | App connects with `MINIO_ACCESS_KEY` (app-level credentials, not root) over Docker internal HTTP | +| `services/storage.py` → PostgreSQL via `AsyncSession` | All queries parameterized via SQLAlchemy ORM (SEC-03) | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-01-04-01 | Tampering | Object key prediction / path traversal | mitigate | `MinIOBackend.put_object` constructs keys server-side as `{user_id}/{document_id}/{uuid4()}{ext}` (STORE-02, D-06); no caller can inject a key. The `extension` is derived from `Path(original_name).suffix.lower()` — the only piece of user input — and is concatenated as a suffix to a freshly generated UUID, so a malicious extension cannot escape the prefix (the leading `{user_id}/{document_id}/{uuid4}` portion is always server-controlled). Wave 0 tests `test_object_key_schema` + `test_filename_not_in_object_key` enforce this on every CI run. | +| T-01-04-02 | Information Disclosure | Human filename leaking into the object store namespace | mitigate | `MinIOBackend.put_object` does NOT accept a `filename` parameter (only `extension`). The human filename is stored only in the `documents.filename` DB column (D-06). Test `test_filename_not_in_object_key` asserts this on every run. | +| T-01-04-03 | Information Disclosure | Extension as a covert channel (e.g., `.html` for XSS via download) | accept | Phase 1 has no download flow exposed to the browser (no presigned URL endpoint, no proxied download endpoint). Phase 4 (DOC-02) will introduce PDF-only proxied preview and content-type validation. Documented in SKELETON.md. | +| T-01-04-04 | Tampering | SQL injection via ORM string concatenation | mitigate | All queries use `select(...)`, `delete(...)`, `session.execute(...)` with parameterized values — never f-strings or `.format()` for SQL. The only string interpolation is the MinIO object key, which is not SQL. SEC-03 / CLAUDE.md. | +| T-01-04-05 | Denial of Service | Unbounded list_metadata returning all documents | accept | Phase 1 has only the single test user's documents (≤ 100 MB of uploads from D-04 reset state). Phase 3 (STORE-03) introduces pagination + quota enforcement. The existing pagination in `api/documents.py` (`page`, `per_page`) remains in effect — Plan 05 will wire it. | +| T-01-04-SC | Tampering | npm/pip/cargo installs | N/A | No new installs in this plan; uses minio/sqlalchemy from Plan 01. RESEARCH.md Package Legitimacy Audit covers both (slopcheck OK). | + + + +- `cd backend && python3 -m pytest tests/test_storage.py -v` exits 0 with all 6 tests PASSED (no XFAIL). +- `cd backend && python3 -c "from services import storage; import inspect; assert inspect.iscoroutinefunction(storage.save_upload)"` exits 0. +- `grep -c "filelock\|FileLock" backend/services/storage.py` equals 0. +- The full test suite `cd backend && python3 -m pytest tests/ -v` still exits 0 — the legacy sync tests (`test_documents.py` un-suffixed names) still pass against the OLD flat-file behavior because Plan 05 has not yet swapped `api/documents.py` callers; this is intentional and documents the cutover boundary. + + + +- `backend/storage/base.py`, `backend/storage/__init__.py`, `backend/storage/minio_backend.py` exist; the ABC + factory pattern mirrors `backend/ai/` exactly. +- `backend/services/storage.py` is rewritten with async signatures, ORM access, and the MinIO backend integration; `filelock` is gone from this module. +- All six Plan 02 `tests/test_storage.py` tests pass (xfail markers removed). +- The legacy `tests/test_documents.py` sync tests continue to pass — the cutover happens in Plan 05; this plan is additive on the storage layer side. + + + +Create `.planning/phases/01-infrastructure-foundation/01-04-SUMMARY.md` when done. Include: full function-by-function summary of `services/storage.py` changes (old signature → new signature), the MinIO key format produced for a sample upload, and any helpers introduced (e.g., `_doc_to_dict`, `_backend()` singleton). + diff --git a/.planning/phases/01-infrastructure-foundation/01-05-PLAN.md b/.planning/phases/01-infrastructure-foundation/01-05-PLAN.md new file mode 100644 index 0000000..2003196 --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-05-PLAN.md @@ -0,0 +1,604 @@ +--- +phase: 01-infrastructure-foundation +plan: 05 +type: execute +wave: 4 +depends_on: + - 01-04 +files_modified: + - backend/main.py + - backend/api/documents.py + - backend/api/topics.py + - backend/celery_app.py + - backend/tasks/__init__.py + - backend/tasks/document_tasks.py + - backend/services/classifier.py + - backend/config.py + - backend/tests/conftest.py + - backend/tests/test_documents.py + - backend/data +autonomous: false +requirements: + - STORE-01 + - STORE-07 +user_setup: [] +tags: + - api-wiring + - lifespan + - celery + - cutover + - walking-skeleton + +must_haves: + truths: + - "`backend/main.py` lifespan opens the MinIO client, auto-creates the `docuvault` bucket if missing, attaches it to `app.state.minio`, and disposes the SQLAlchemy engine on shutdown" + - "`backend/main.py` `/health` endpoint returns `{\"status\": \"ok\"|\"degraded\", \"checks\": {\"postgres\": \"ok\"|\"error: ...\", \"minio\": \"ok\"|\"error: ...\"}}` (D-07)" + - "`backend/api/documents.py` upload, list, get, delete, classify endpoints all inject `session: AsyncSession = Depends(get_db)` and call the async `services.storage.*` functions" + - "`backend/api/topics.py` list/create/update/delete/suggest endpoints all inject the session dependency" + - "`backend/celery_app.py` instantiates a Celery app with broker + result_backend from `REDIS_URL`, JSON serialization, and a `documents` queue route" + - "`backend/tasks/document_tasks.py` declares a sync `def extract_and_classify(document_id: str) -> dict` Celery task that the upload handler calls via `.delay(...)`" + - "FastAPI `BackgroundTasks` usage is removed (STORE-08 satisfied for Phase 1; was never directly used in current codebase but the `await classifier.classify_document` inline call in the upload handler is replaced with `.delay()`)" + - "All legacy flat-file constants and helpers (`UPLOADS_DIR`, `METADATA_DIR`, `TOPICS_FILE`, `ensure_data_dirs`) are removed from `backend/config.py`; the `data/` directory contents are deleted (D-04)" + - "Existing `tests/test_documents.py` sync tests are DELETED (cutover) and the `_async` variants from Plan 02 have their `@pytest.mark.xfail` markers removed; the legacy `client` (sync TestClient) fixture is removed from conftest.py" + - "Walking-skeleton end-to-end verification passes: `docker compose up` boots all services healthy; a real PDF upload through `POST /api/documents/upload` persists to PostgreSQL + MinIO; Celery worker logs show `extract_and_classify` ran; the document appears in `GET /api/documents`" + artifacts: + - path: "backend/main.py" + provides: "Lifespan with engine + MinIO bucket init; extended /health" + contains: "app.state.minio" + - path: "backend/celery_app.py" + provides: "Celery app with Redis broker + JSON serialization + documents queue" + contains: "Celery(\"docuvault\")" + - path: "backend/tasks/document_tasks.py" + provides: "extract_and_classify Celery task" + contains: "@celery_app.task" + - path: "backend/api/documents.py" + provides: "Async route handlers using AsyncSession + Celery .delay()" + contains: "session: AsyncSession = Depends(get_db)" + - path: "backend/api/topics.py" + provides: "Async route handlers using AsyncSession" + contains: "session: AsyncSession = Depends(get_db)" + - path: "backend/services/classifier.py" + provides: "Updated to accept session and called from a sync wrapper inside Celery tasks" + contains: "async def classify_document" + - path: "backend/config.py" + provides: "Cleaned up Pydantic Settings — legacy data-dir constants removed" + contains: "class Settings(BaseSettings)" + key_links: + - from: "backend/api/documents.py upload handler" + to: "backend/tasks/document_tasks.extract_and_classify" + via: "extract_and_classify.delay(str(saved['id']))" + pattern: "extract_and_classify\\.delay" + - from: "backend/main.py lifespan" + to: "MinIO bucket auto-create" + via: "make_bucket if not bucket_exists" + pattern: "make_bucket|bucket_exists" + - from: "backend/main.py /health" + to: "AsyncSessionLocal + minio_client.bucket_exists" + via: "probe queries" + pattern: "AsyncSessionLocal|bucket_exists" + - from: "backend/celery_app.py" + to: "REDIS_URL env var" + via: "os.environ.get('REDIS_URL', ...)" + pattern: "REDIS_URL" +--- + + +Complete the Phase 1 cutover: wire every API route to the async storage layer, replace inline classification with a Celery `.delay()` call, extend the FastAPI lifespan with MinIO bucket creation + engine disposal, rewrite `/health` to probe PostgreSQL + MinIO (D-07), introduce `celery_app.py` + `tasks/document_tasks.py`, remove every legacy flat-file artifact (D-04), delete the legacy sync tests + sync TestClient fixture, and verify the walking skeleton end-to-end against a live Docker stack. + +Purpose: This plan closes the loop. After it ships, ROADMAP.md Phase 1 success criteria #1, #3, and #4 are all satisfied (criterion #2 was satisfied in Plan 03). Phase 1 ends with a usable single-user app whose entire internal architecture is the multi-user-ready PostgreSQL + MinIO + Celery stack. + +Output: Updated `backend/main.py`, `backend/api/documents.py`, `backend/api/topics.py`, `backend/services/classifier.py`; new `backend/celery_app.py` + `backend/tasks/document_tasks.py`; cleaned `backend/config.py`; final test-suite cutover; deletion of `data/` directory; and a passing end-to-end walking-skeleton verification checkpoint. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@CLAUDE.md +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/01-infrastructure-foundation/01-CONTEXT.md +@.planning/phases/01-infrastructure-foundation/01-RESEARCH.md +@.planning/phases/01-infrastructure-foundation/01-PATTERNS.md +@.planning/phases/01-infrastructure-foundation/SKELETON.md +@.planning/phases/01-infrastructure-foundation/01-04-SUMMARY.md +@backend/services/storage.py +@backend/db/models.py +@backend/db/session.py +@backend/deps/db.py +@backend/storage/__init__.py + + +After Plan 04, the async storage layer is in place. This plan wires consumers. + +Existing `api/documents.py` consumer points (must be ported to async + session injection): +- `storage.save_upload(content, file.filename, mime)` → `await storage.save_upload(session, content, file.filename, mime)` +- `storage.save_metadata(meta)` → `await storage.save_metadata(session, meta)` +- `storage.list_metadata(topic=topic)` → `await storage.list_metadata(session, topic=topic)` +- `storage.get_metadata(doc_id)` → `await storage.get_metadata(session, doc_id)` +- `storage.delete_document(doc_id)` → `await storage.delete_document(session, doc_id)` +- `await classifier.classify_document(saved["id"])` → `extract_and_classify.delay(saved["id"])` (Celery task — STORE-08) + +Existing `api/topics.py` consumer points: +- `storage.load_topics()` → `await storage.load_topics(session)` +- `storage.topic_doc_counts()` → `await storage.topic_doc_counts(session)` +- `storage.create_topic(...)` → `await storage.create_topic(session, ...)` +- `storage.update_topic(...)` → `await storage.update_topic(session, ...)` +- `storage.delete_topic(...)` → `await storage.delete_topic(session, ...)` +- `storage.get_metadata(...)` → `await storage.get_metadata(session, ...)` (used by `/suggest`) + +Existing `services/classifier.py` consumer points (called by both the soon-removed inline upload path and the new Celery task; module signature changes from `async def classify_document(doc_id)` accepting no session to `async def classify_document(session, doc_id)`): +- Used inside the Celery task wrapper via `asyncio.run(classify_document(session, doc_id))` after manually opening a session + +`api/settings.py` — KEEP AS-IS. The `settings.json` flat file lives until Phase 2 (D-03 settings deferred); the `services/storage.load_settings()` / `save_settings()` functions remain sync per Plan 04. + +`main.py` lifespan contract (current → new): +```python +# current +async def lifespan(app): + ensure_data_dirs() + yield + +# new +async def lifespan(app): + # MinIO bucket auto-create + minio_client = Minio(settings.minio_endpoint, access_key=..., secret_key=..., secure=False) + if not await asyncio.to_thread(minio_client.bucket_exists, settings.minio_bucket): + await asyncio.to_thread(minio_client.make_bucket, settings.minio_bucket) + app.state.minio = minio_client + yield + await engine.dispose() +``` + +`/health` response contract (D-07): +```json +{ + "status": "ok", + "checks": {"postgres": "ok", "minio": "ok"} +} +``` +Or `"status": "degraded"` if any check is not `"ok"`. + + + + + + + Task 1: Introduce backend/celery_app.py + backend/tasks/document_tasks.py and update services/classifier.py + backend/celery_app.py, backend/tasks/__init__.py, backend/tasks/document_tasks.py, backend/services/classifier.py + + - `from celery_app import celery_app` imports the configured Celery instance + - `celery_app.conf.broker_url` and `celery_app.conf.result_backend` both read from `REDIS_URL` env var (falling back to `redis://redis:6379/0` if unset) + - `celery_app.conf.task_serializer == "json"` and `celery_app.conf.accept_content == ["json"]` + - `celery_app.conf.task_routes` routes `tasks.document_tasks.*` to the `documents` queue + - `tasks.document_tasks.extract_and_classify(document_id: str)` is a plain `def` (NOT `async def`) decorated with `@celery_app.task(name="tasks.document_tasks.extract_and_classify")` + - The task opens a fresh `AsyncSession` via `asyncio.run(...)` around the async body, calls `services.extractor.extract_text(...)` on the bytes pulled from MinIO via `MinIOBackend.get_object`, persists the extracted text via `services.storage.save_metadata`, then calls `services.classifier.classify_document(session, doc_id)` and persists the result + - Failures in classification do not raise — they update the document's `status` to `"classification_failed"` and store the error string in a `classification_error` field on the returned dict (parity with the existing non-fatal-classification pattern in `api/documents.py`) + - `services/classifier.py` is updated to accept a `session: AsyncSession` as its first arg; the previous `storage.get_metadata(doc_id)` becomes `await storage.get_metadata(session, doc_id)`; same pattern for `storage.load_settings()` (still sync, no change), `storage.load_topics(session)`, `storage.create_topic(session, name)`, `storage.update_document_topics(session, doc_id, topics)` + - `tasks/__init__.py` exists (empty file is acceptable) so `tasks/` is recognized as a package + + + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pattern 5 — Celery + Redis configuration; Pitfall 7 — keep celery_app.py minimal to avoid circular imports; Anti-Pattern: do not use async def for Celery task functions) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/celery_app.py + backend/tasks/document_tasks.py sections) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-08 Celery+Redis; D-10 celery-worker service exists per Plan 01) + - backend/services/classifier.py (read in full — every `storage.*` call site needs an `await ... (session, ...)` rewrite) + - backend/services/extractor.py (read once — verify which function name is used, then call it from the Celery task; do NOT modify this file) + - backend/db/session.py (Plan 03 output — confirm `AsyncSessionLocal` is the exported symbol) + - backend/services/storage.py (Plan 04 output — confirm async function signatures the task will call) + + + Create `backend/celery_app.py` with minimal imports per Pitfall 7: `import os`, `from celery import Celery`. Instantiate `celery_app = Celery("docuvault")`. Configure: `celery_app.conf.broker_url = os.environ.get("REDIS_URL", "redis://redis:6379/0")`, `celery_app.conf.result_backend = os.environ.get("REDIS_URL", "redis://redis:6379/0")`, `celery_app.conf.task_serializer = "json"`, `celery_app.conf.result_serializer = "json"`, `celery_app.conf.accept_content = ["json"]`, `celery_app.conf.task_routes = {"tasks.document_tasks.*": {"queue": "documents"}}`. Then `celery_app.autodiscover_tasks(["tasks"], force=True)` so registering tasks under `tasks/` works without an explicit import. Critically, DO NOT import `from config import settings` here — `config.py` triggers Pydantic Settings env-loading that may pull in FastAPI-related side effects in some setups. Read REDIS_URL directly from `os.environ`. + + Create `backend/tasks/__init__.py` as an empty file. + + Create `backend/tasks/document_tasks.py`. Imports: `import asyncio`, `from celery_app import celery_app`, `from db.session import AsyncSessionLocal`, `from services import storage, extractor, classifier`, `from storage import get_storage_backend`. Define the task: + + ```python + @celery_app.task(name="tasks.document_tasks.extract_and_classify") + def extract_and_classify(document_id: str) -> dict: + return asyncio.run(_run(document_id)) + + async def _run(document_id: str) -> dict: + async with AsyncSessionLocal() as session: + meta = await storage.get_metadata(session, document_id) + if meta is None: + return {"document_id": document_id, "status": "not_found"} + # Fetch the bytes from MinIO so the extractor can read them + backend = get_storage_backend() + try: + obj_key = meta.get("object_key") or meta.get("path") + # The object_key shape is {user_id}/{doc_id}/{uuid4}{ext} — retrieve via storage_backend + # We don't have object_key on the metadata dict in v1 — read from DB directly: + from db.models import Document + import uuid as _uuid + doc = await session.get(Document, _uuid.UUID(document_id)) + if doc is None or not doc.object_key: + return {"document_id": document_id, "status": "missing_object"} + file_bytes = await backend.get_object(doc.object_key) + text = extractor.extract_text_from_bytes(file_bytes, doc.content_type) if hasattr(extractor, "extract_text_from_bytes") else extractor.extract_text_bytes(file_bytes, doc.content_type) + meta["extracted_text"] = text + await storage.save_metadata(session, meta) + except Exception as e: + return {"document_id": document_id, "status": "extract_failed", "error": str(e)} + try: + topics = await classifier.classify_document(session, document_id) + return {"document_id": document_id, "status": "classified", "topics": topics} + except Exception as e: + # Non-fatal — preserve the existing convention from api/documents.py line 54-56 + doc.status = "classification_failed" + await session.commit() + return {"document_id": document_id, "status": "classification_failed", "error": str(e)} + ``` + + Note: If `services/extractor.py` only exposes `extract_text(path, mime)` (file-path-based), add a new helper `extract_text_from_bytes(file_bytes: bytes, mime: str)` to `services/extractor.py` that writes `file_bytes` to a `tempfile.NamedTemporaryFile(suffix=...)`, calls the existing `extract_text(tmp.name, mime)`, and unlinks the temp file. Do not modify any other behavior in `services/extractor.py`. + + Update `backend/services/classifier.py`: change `async def classify_document(doc_id: str, topic_names: list[str] | None = None)` to `async def classify_document(session: AsyncSession, doc_id: str, topic_names: list[str] | None = None)`. Add `from sqlalchemy.ext.asyncio import AsyncSession` at the top. Replace `storage.get_metadata(doc_id)` → `await storage.get_metadata(session, doc_id)`. Replace `storage.load_settings()` → `storage.load_settings()` (unchanged — Phase 1 keeps the flat file; this is sync). Replace `storage.load_topics()` → `await storage.load_topics(session)` (note signature change — adapter call). Replace `storage.create_topic(name.strip())` → `await storage.create_topic(session, name.strip())`. Replace `storage.update_document_topics(doc_id, final_topics)` → `await storage.update_document_topics(session, doc_id, final_topics)`. Apply the same session-injection treatment to `suggest_topics_for_document(session, doc_id)`. Preserve `MAX_AI_CHARS = 8_000` and every other line verbatim. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -c " +import os +os.environ.setdefault('REDIS_URL', 'redis://localhost:6379/0') +from celery_app import celery_app +assert celery_app.conf.task_serializer == 'json' +assert celery_app.conf.accept_content == ['json'] +assert 'tasks.document_tasks.*' in celery_app.conf.task_routes +assert celery_app.conf.task_routes['tasks.document_tasks.*'] == {'queue': 'documents'} +from tasks.document_tasks import extract_and_classify +import inspect +assert not inspect.iscoroutinefunction(extract_and_classify), 'Celery task must be sync def (not async def)' +# Verify the task is registered with Celery +registered = celery_app.tasks +assert 'tasks.document_tasks.extract_and_classify' in registered, f'task not registered; have: {list(registered.keys())[-5:]}' +import services.classifier as cl +sig = inspect.signature(cl.classify_document) +assert list(sig.parameters.keys())[0] == 'session', f'classify_document first param should be session, got: {list(sig.parameters.keys())}' +print('celery-task-ok') +" + + + - `backend/celery_app.py` exists and contains `celery_app = Celery("docuvault")` + - `backend/celery_app.py` does NOT import from `config` (Pitfall 7 — verifiable: `grep -c "from config\|import config" backend/celery_app.py | grep -q "^0$"`) + - `backend/celery_app.py` contains `task_routes = {"tasks.document_tasks.*": {"queue": "documents"}}` + - `backend/tasks/__init__.py` exists + - `backend/tasks/document_tasks.py` contains `@celery_app.task(name="tasks.document_tasks.extract_and_classify")` + - `backend/tasks/document_tasks.py` defines `def extract_and_classify` as a sync `def` (NOT `async def`) — verifiable via the inline `inspect.iscoroutinefunction` assertion + - `backend/tasks/document_tasks.py` uses `asyncio.run` to invoke the async body (verifiable: `grep -c "asyncio.run" backend/tasks/document_tasks.py >= 1`) + - `backend/services/classifier.py` first parameter of `classify_document` is `session` (verified by the inline signature inspection) + - `backend/services/classifier.py` calls `await storage.get_metadata(session, doc_id)` and `await storage.update_document_topics(session, doc_id, ...)` + - `services/extractor.py` either already exposes a bytes-based extraction function OR a new `extract_text_from_bytes` helper is added; in either case the Celery task can import and call it without raising on import — verifiable via `python3 -c "from services import extractor; assert hasattr(extractor, 'extract_text_from_bytes') or hasattr(extractor, 'extract_text_bytes') or hasattr(extractor, 'extract_text')"` exits 0 + - The Verify command prints `celery-task-ok` + + Celery is wired with a Redis-backed broker; the `extract_and_classify` task is registered and discoverable; `services/classifier.py` is session-aware; the Phase 1 background worker contract is in place. + + + + Task 2: Wire backend/main.py lifespan + /health, rewrite backend/api/documents.py and backend/api/topics.py to async session injection + backend/main.py, backend/api/documents.py, backend/api/topics.py + + - `GET /health` returns HTTP 200 with body `{"status": "ok", "checks": {"postgres": "ok", "minio": "ok"}}` when both services are healthy + - `GET /health` returns HTTP 200 with body `{"status": "degraded", "checks": {"postgres": "error: ...", "minio": "ok"}}` (or analogous shape) when one service is unreachable — `/health` never returns 5xx + - `POST /api/documents/upload` calls `await storage.save_upload(session, ...)` then `extract_and_classify.delay(str(saved["id"]))` if `auto_classify` is true; the response shape preserves `{"id", "original_name", "filename", "mime_type", "size_bytes", "extracted_text", "topics", "created_at", "classified_at"}` so the frontend continues to work + - `GET /api/documents` calls `await storage.list_metadata(session, topic=topic)` and paginates the result + - `GET /api/documents/{doc_id}` and `DELETE /api/documents/{doc_id}` use the session dependency + - `POST /api/documents/{doc_id}/classify` injects the session and either calls the Celery task with `.delay(...)` or `await classifier.classify_document(session, doc_id, topic_names)` synchronously and returns the result; choose the synchronous in-route call for this endpoint because it has historically returned the topic list (preserve behavior — Phase 4 may change this) + - `GET /api/topics`, `POST /api/topics`, `PATCH /api/topics/{topic_id}`, `DELETE /api/topics/{topic_id}`, `POST /api/topics/suggest` all inject the session dependency + - `backend/main.py` lifespan creates the MinIO client, auto-creates the `docuvault` bucket if absent, stores it on `app.state.minio`, and disposes `engine` on shutdown + - `backend/main.py` no longer calls `ensure_data_dirs()` (legacy) + + + - backend/main.py (current 34-line file — preserve `app = FastAPI(...)`, `app.add_middleware(CORSMiddleware, ...)`, `app.include_router(...)` calls; replace only the lifespan body and the `/health` handler) + - backend/api/documents.py (current 102 lines — read every route handler; preserve every `HTTPException` message and the `ALLOWED_MIME_TYPES` set verbatim) + - backend/api/topics.py (current 73 lines — read every route handler; preserve Pydantic models `TopicCreate`, `TopicUpdate`, `SuggestRequest` verbatim) + - backend/services/storage.py (Plan 04 output — async function signatures) + - backend/db/session.py (Plan 03 output — `AsyncSessionLocal`, `engine`) + - backend/deps/db.py (Plan 03 output — `get_db`) + - backend/tasks/document_tasks.py (Task 1 output — `extract_and_classify`) + - .planning/phases/01-infrastructure-foundation/01-PATTERNS.md (backend/main.py + backend/api/documents.py + backend/api/topics.py sections) + - .planning/phases/01-infrastructure-foundation/01-RESEARCH.md (Pattern 4 — MinIO bucket initialization at startup) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-07 /health extended) + + + Rewrite `backend/main.py`. Imports: keep `from contextlib import asynccontextmanager`, `from fastapi import FastAPI, Request`, `from fastapi.middleware.cors import CORSMiddleware`, `from api.documents import router as documents_router`, `from api.topics import router as topics_router`, `from api.settings import router as settings_router`. Add `import asyncio`, `from minio import Minio`, `from sqlalchemy import text`, `from db.session import engine, AsyncSessionLocal`, `from config import settings`. DO NOT import `ensure_data_dirs`. + + Rewrite the lifespan function: + ```python + @asynccontextmanager + async def lifespan(app: FastAPI): + # MinIO bucket initialization (RESEARCH.md Pattern 4) + minio_client = Minio( + settings.minio_endpoint, + access_key=settings.minio_access_key, + secret_key=settings.minio_secret_key, + secure=False, + ) + exists = await asyncio.to_thread(minio_client.bucket_exists, settings.minio_bucket) + if not exists: + await asyncio.to_thread(minio_client.make_bucket, settings.minio_bucket) + app.state.minio = minio_client + yield + await engine.dispose() + ``` + + Rewrite the `/health` handler (preserving `@app.get("/health")` and `async def`): + ```python + @app.get("/health") + async def health(request: Request): + checks = {} + try: + async with AsyncSessionLocal() as session: + await session.execute(text("SELECT 1")) + checks["postgres"] = "ok" + except Exception as e: + checks["postgres"] = f"error: {type(e).__name__}: {e}" + try: + ok = await asyncio.to_thread(request.app.state.minio.bucket_exists, settings.minio_bucket) + checks["minio"] = "ok" if ok else "error: bucket missing" + except Exception as e: + checks["minio"] = f"error: {type(e).__name__}: {e}" + status = "ok" if all(v == "ok" for v in checks.values()) else "degraded" + return {"status": status, "checks": checks} + ``` + + Keep the existing CORS middleware (`allow_origins=["*"]` for Phase 1 — Phase 2 locks down). + + Rewrite `backend/api/documents.py`. Imports: replace `from services import storage, extractor, classifier` with `from sqlalchemy.ext.asyncio import AsyncSession`, `from deps.db import get_db`, `from services import storage, extractor`, `from tasks.document_tasks import extract_and_classify`, `from services import classifier` (only used by the `/classify` endpoint), and keep `from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query, Depends` (add `Depends`). Preserve the router definition and `ALLOWED_MIME_TYPES` set. + + For every route, append `session: AsyncSession = Depends(get_db)` to the signature. For each storage call, prepend `await` and pass `session` as the first arg: + - `upload_document`: read content, validate empty, generate filename + mime, call `await storage.save_upload(session, content, file.filename or "upload", mime)`, then `text = extractor.extract_text(saved["path"], mime)` — wait: `saved["path"]` is now the object_key, not a filesystem path. CHANGE the extraction step: pull bytes already in memory (`content`) and call the new `extractor.extract_text_from_bytes(content, mime)` helper introduced in Task 1. Build the `meta` dict exactly as before (preserve the keys), call `await storage.save_metadata(session, meta)`. If `auto_classify` is True, call `extract_and_classify.delay(saved["id"])` (NOTE: this re-fetches from MinIO inside the worker — acceptable for Phase 1; Phase 3 will pass the bytes through Redis directly if perf demands). Return `meta`. CHANGE: since classification is now async via Celery, the response no longer includes `topics` populated by the inline classifier call — set `meta["topics"] = []` and `meta["classified_at"] = None` and rely on the worker to update the DB row. Document this in a comment as the Phase 1 cutover behavior. + - `list_documents`: `docs = await storage.list_metadata(session, topic=topic)` then preserve the existing pagination math. + - `get_document`: `meta = await storage.get_metadata(session, doc_id)`; raise 404 if `None`. + - `delete_document`: `ok = await storage.delete_document(session, doc_id)`; raise 404 if `False`. + - `classify_document` (the `/classify` route): `meta = await storage.get_metadata(session, doc_id)`; raise 404 if None; preserve the inline `await classifier.classify_document(session, doc_id, topic_names)` call (this endpoint historically returned the topic list synchronously — keep that behavior; the upload-time path is the async one). + + Rewrite `backend/api/topics.py` similarly: add `Depends` import, `from sqlalchemy.ext.asyncio import AsyncSession`, `from deps.db import get_db`. Add `session: AsyncSession = Depends(get_db)` to every route. Wrap every `storage.*` call with `await` and prepend `session`: + - `list_topics`: `topics = await storage.load_topics(session)`, `counts = await storage.topic_doc_counts(session)`. + - `create_topic`: `topic = await storage.create_topic(session, body.name, body.description, body.color)`. + - `update_topic`: `topic = await storage.update_topic(session, topic_id, name=body.name, description=body.description, color=body.color)`. + - `delete_topic`: `name = await storage.delete_topic(session, topic_id)`. + - `suggest_topics`: `meta = await storage.get_metadata(session, body.document_id)`; if None, 404; then `await classifier.suggest_topics_for_document(session, body.document_id)`. + + Preserve every Pydantic model and HTTPException message verbatim. + + + cd /Users/nik/Documents/Progamming/document_scanner/backend && python3 -c " +import os +os.environ.setdefault('REDIS_URL', 'redis://localhost:6379/0') +os.environ.setdefault('DATABASE_URL', 'postgresql+psycopg://docuvault_app:changeme_app@localhost:5432/docuvault') +import inspect +from main import app +# Confirm /health route exists with new shape +routes = {r.path for r in app.routes} +assert '/health' in routes, 'health route missing' +# Confirm session injection on documents and topics +from api.documents import upload_document, list_documents, get_document, delete_document +from api.topics import list_topics, create_topic, update_topic, delete_topic +for fn in [upload_document, list_documents, get_document, delete_document, list_topics, create_topic, update_topic, delete_topic]: + sig = inspect.signature(fn) + params = list(sig.parameters) + assert 'session' in params, f'{fn.__name__} missing session param: {params}' +print('routes-wired-ok') +" + + + - `backend/main.py` no longer contains `ensure_data_dirs` (verifiable: `grep -c "ensure_data_dirs" backend/main.py | grep -q "^0$"`) + - `backend/main.py` contains `from minio import Minio`, `from db.session import engine, AsyncSessionLocal`, `from config import settings` + - `backend/main.py` lifespan contains `app.state.minio = minio_client` + - `backend/main.py` lifespan contains `await engine.dispose()` + - `backend/main.py` `/health` contains both `SELECT 1` and `bucket_exists` probes + - `backend/main.py` `/health` returns the shape `{"status": ..., "checks": {"postgres": ..., "minio": ...}}` (verifiable by inspecting the return statement and by Task 3 live test) + - `backend/api/documents.py` contains `from deps.db import get_db` and `from tasks.document_tasks import extract_and_classify` + - `backend/api/documents.py` upload handler contains `extract_and_classify.delay(` (Celery enqueue) — verifiable via `grep -c "extract_and_classify.delay" backend/api/documents.py >= 1` + - `backend/api/documents.py` upload handler no longer contains `await classifier.classify_document` in the upload path (the only remaining classifier call is on the `/classify` endpoint) — verifiable via `grep -c "await classifier.classify_document" backend/api/documents.py | grep -q "^1$"` + - Every route in `backend/api/documents.py` and `backend/api/topics.py` contains `session: AsyncSession = Depends(get_db)` (verifiable via `grep -c "session: AsyncSession = Depends(get_db)" backend/api/documents.py >= 5` and similarly `>= 5` for topics.py) + - The Verify command prints `routes-wired-ok` + - `cd backend && python3 -m pytest tests/ -v --collect-only` exits 0 (collection succeeds — no import errors) + + FastAPI lifespan creates the MinIO bucket and disposes the engine; `/health` probes both services; all document and topic routes use async session injection; upload-time classification is queued via Celery; the `/classify` endpoint remains synchronous for compatibility. + + + + Task 3: Final cutover — delete legacy data/, prune config.py, prune conftest.py + test_documents.py legacy fixtures and sync tests, unfail async ports + backend/config.py, backend/tests/conftest.py, backend/tests/test_documents.py, backend/tests/test_health.py, backend/data + + - `backend/data/` directory and all its contents are deleted (D-04). The git repo no longer tracks this directory. + - `backend/config.py` no longer declares `DATA_DIR`, `UPLOADS_DIR`, `METADATA_DIR`, `TOPICS_FILE`, `ensure_data_dirs` (legacy flat-file constants). It retains `DEFAULT_SETTINGS`, `DEFAULT_SYSTEM_PROMPT`, the Pydantic `Settings` class, and the module-level `settings = Settings()`. `SETTINGS_FILE` is RETAINED (still used for the Phase-2-deferred settings JSON file path) but its value is rebased onto the new `settings.data_dir` field rather than a removed module-level constant. + - `backend/tests/conftest.py` no longer defines the autouse `isolated_data_dir` fixture (the flat-file scaffold). It no longer defines the sync `client` fixture (which built a `TestClient`). The only fixtures remaining are `db_session`, `async_client`, `sample_txt`, `sample_pdf`. Tests are rewired to use `async_client` everywhere. + - `backend/tests/test_documents.py` no longer contains any of the legacy sync test functions (`test_upload_txt_no_classify`, `test_upload_pdf_no_classify`, `test_list_documents`, `test_list_documents_filter_by_topic`, `test_get_document`, `test_get_document_not_found`, `test_delete_document`, `test_delete_document_not_found`, `test_upload_empty_file`) — they are DELETED. The `_async`-suffixed tests from Plan 02 have their `@pytest.mark.xfail` markers REMOVED and now run as live tests. + - `backend/tests/test_health.py` has the `@pytest.mark.xfail` marker removed from `test_health_checks_postgres_and_minio`; the old sync `test_health(client)` is REPLACED with `test_health_status_ok_sync_deprecated` that is `@pytest.mark.skip(reason="legacy sync client removed in plan 05")` OR deleted entirely. Choose deletion — cleaner. + - The full pytest run reports zero XFAIL/SKIPPED for Phase 1 tests except where the underlying service (live PostgreSQL/MinIO/Redis) is unavailable in the test env (in which case the integration tests are skipped via a fixture that probes for service availability — see action below). + + + - backend/config.py (Plan 01 output — identify exactly which legacy constants must be removed) + - backend/tests/conftest.py (Plan 02 output — verify which fixtures need to go) + - backend/tests/test_documents.py (Plan 02 output — verify which test functions to delete vs keep) + - backend/tests/test_health.py (Plan 02 output — same scrutiny) + - backend/services/storage.py (Plan 04 output — confirm it still imports `SETTINGS_FILE` from config; we will preserve `SETTINGS_FILE` as a derived path) + - backend/api/settings.py (read — uses `services.storage.load_settings/save_settings` which depend on `SETTINGS_FILE`) + - .planning/phases/01-infrastructure-foundation/01-CONTEXT.md (D-04 delete `data/` contents) + + + Step 1 — delete legacy data: run `git rm -rf backend/data/` (if tracked) and `rm -rf backend/data/`. Add `backend/data/` to `.gitignore`. If `services/storage.py` or any other file still references `UPLOADS_DIR`/`METADATA_DIR`/`TOPICS_FILE`, fix those references first (per Plan 04 they should already be gone for documents/topics; `SETTINGS_FILE` is the only remaining legacy path). + + Step 2 — prune `backend/config.py`. Read the current file (post-Plan 01). Remove: + - `DATA_DIR = Path(os.environ.get("DATA_DIR", "/app/data"))` + - `UPLOADS_DIR = DATA_DIR / "uploads"` + - `METADATA_DIR = DATA_DIR / "metadata"` + - `TOPICS_FILE = DATA_DIR / "topics.json"` + - the `def ensure_data_dirs():` function + - the `import json` at the top if no longer used (keep if `DEFAULT_SETTINGS` interpolates from JSON anywhere — currently no) + - the `import os` (no longer needed since `Settings` reads env via Pydantic) + Preserve: + - `from pathlib import Path` (used by `SETTINGS_FILE`) + - `DEFAULT_SYSTEM_PROMPT` (used by `services/classifier.py`) + - `DEFAULT_SETTINGS` (used by `services/storage.load_settings` fallback) + - `class Settings(BaseSettings):` with all Phase 1 fields (Plan 01 output) + - `settings = Settings()` module instance + Rebase `SETTINGS_FILE` as a derived path computed from `settings.data_dir`: + `SETTINGS_FILE = Path(settings.data_dir) / "settings.json"` — placed AFTER the `settings = Settings()` line. Add a comment: `# SETTINGS_FILE: still flat-file in Phase 1; migrates to users.ai_provider in Phase 2`. + + Step 3 — prune `backend/tests/conftest.py`: + - DELETE the entire `isolated_data_dir` fixture (the autouse one that monkey-patches `config.DATA_DIR` etc.) + - DELETE the sync `client` fixture (`with TestClient(app) as c: yield c`) + - KEEP the `db_session`, `async_client`, `sample_txt`, `sample_pdf` fixtures introduced in Plan 02. Promote `async_client` so the previous behavior — fail gracefully if `deps.db` does not exist — is replaced with a hard dependency: remove the `try/except ImportError: pytest.skip(...)` wrapper inside the async fixtures because `deps.db.get_db` now exists. + - ADD a new `pytest_asyncio.fixture(scope="session")` named `live_services_available` that probes localhost:5432, localhost:9000, localhost:6379 via `socket.create_connection(..., timeout=1)`; if any probe fails, the fixture yields `False`; otherwise `True`. Update the `async_client` fixture (or add a new `live_async_client` fixture) to use an actual PostgreSQL + MinIO when `live_services_available` is True, falling back to the in-memory aiosqlite engine when False. Use `pytest.mark.skipif(not live_services_available, reason="docker compose not running")` on integration tests that need real MinIO; unit tests using only the in-memory DB do not need the skip marker. (Simpler approach acceptable: detect via env var `INTEGRATION=1`; if unset, skip integration tests.) + + Step 4 — prune `backend/tests/test_documents.py`: + - DELETE the legacy sync tests: `test_upload_txt_no_classify`, `test_upload_pdf_no_classify`, `test_list_documents`, `test_list_documents_filter_by_topic`, `test_get_document`, `test_get_document_not_found`, `test_delete_document`, `test_delete_document_not_found`, `test_upload_empty_file`. (Plan 02 left them in place during the cutover; this plan completes the deletion.) + - On every `_async`-suffixed test added in Plan 02, REMOVE the `@pytest.mark.xfail(strict=False, reason="async storage layer implemented in plan 05")` marker. + - Update any test that previously referenced `import services.storage as st; st.update_document_topics(...)` to use the async ORM API via the `db_session` fixture: `from db.models import Document, DocumentTopic; from sqlalchemy import insert; ...`. For tests that need a topic-tagged document, build it via the API itself (call `POST /api/topics` then `PATCH /api/documents/.../classify`). + + Step 5 — prune `backend/tests/test_health.py`: + - DELETE the legacy `def test_health(client):` (it used the sync TestClient fixture which is gone). + - REMOVE the `@pytest.mark.xfail` marker from `test_health_checks_postgres_and_minio`. + - If `live_services_available` is False, this test should be skipped via `pytest.mark.skipif(...)`. + + Step 6 — run the full suite end-to-end against the in-memory engine: `cd backend && python3 -m pytest tests/ -v` should exit 0 with the storage tests PASSED, the alembic tests PASSED (or SKIPPED if no PostgreSQL available — the in-memory aiosqlite test path covers them), the health test SKIPPED or PASSED, and the async document tests PASSED or SKIPPED depending on `live_services_available`. + + + cd /Users/nik/Documents/Progamming/document_scanner && [ ! -d backend/data ] && echo "data-dir-deleted" && grep -c "DATA_DIR\|UPLOADS_DIR\|METADATA_DIR\|TOPICS_FILE\|ensure_data_dirs" backend/config.py | awk '{exit ($1 == 0) ? 0 : 1}' && echo "config-pruned" && cd backend && python3 -m pytest tests/ -v 2>&1 | tail -20 + + + - `backend/data/` directory does not exist (verifiable: `[ ! -d backend/data ]` exits 0) + - `.gitignore` contains `backend/data/` (verifiable: `grep -Fx "backend/data/" .gitignore` exits 0) + - `backend/config.py` no longer mentions `DATA_DIR`, `UPLOADS_DIR`, `METADATA_DIR`, `TOPICS_FILE`, `ensure_data_dirs` (verifiable via the Verify command's grep-c check) + - `backend/config.py` still defines `DEFAULT_SETTINGS`, `DEFAULT_SYSTEM_PROMPT`, `class Settings(BaseSettings)`, `settings = Settings()`, and `SETTINGS_FILE = Path(...) / "settings.json"` + - `backend/tests/conftest.py` no longer defines `isolated_data_dir` (verifiable: `grep -c "def isolated_data_dir" backend/tests/conftest.py | grep -q "^0$"`) + - `backend/tests/conftest.py` no longer defines a sync `client` fixture using `TestClient` (verifiable: `grep -c "TestClient" backend/tests/conftest.py | grep -q "^0$"`) + - `backend/tests/test_documents.py` no longer contains the legacy sync test names (verifiable: `grep -cE "^def test_(upload_txt_no_classify|upload_pdf_no_classify|list_documents|get_document|delete_document|upload_empty_file)\b" backend/tests/test_documents.py | grep -q "^0$"`) + - `backend/tests/test_documents.py` no longer contains `@pytest.mark.xfail` markers (the cutover removes them — verifiable: `grep -c "@pytest.mark.xfail" backend/tests/test_documents.py | grep -q "^0$"`) + - `backend/tests/test_health.py` no longer contains `@pytest.mark.xfail` (verifiable: `grep -c "@pytest.mark.xfail" backend/tests/test_health.py | grep -q "^0$"`) + - `cd backend && python3 -m pytest tests/ -v 2>&1` shows 0 FAILED and 0 ERROR lines (verifiable: `python3 -m pytest tests/ 2>&1 | grep -E "^FAILED|^ERROR" | wc -l | grep -q "^0$"`) + - The Verify command output shows `data-dir-deleted` and `config-pruned` + + The Phase 1 cutover is complete: no flat-file artifacts remain in code or on disk; the test suite uses only async fixtures; the legacy tests have been deleted; the async ports of every legacy test run as first-class tests. + + + + Task 4: End-to-end walking-skeleton verification — docker compose up + real PDF upload + Celery worker + (verification only) + + - .planning/phases/01-infrastructure-foundation/SKELETON.md (the success contract for this checkpoint) + - .planning/phases/01-infrastructure-foundation/01-03-SUMMARY.md (the Alembic upgrade output from Plan 03) + - .planning/phases/01-infrastructure-foundation/01-04-SUMMARY.md (the storage rewrite summary from Plan 04) + + + Plans 01-05 together: a fully wired DocuVault backend running on Docker Compose with PostgreSQL + MinIO + Redis + Celery + FastAPI. This checkpoint verifies the walking-skeleton end-to-end: a real document upload via the rewritten API persists metadata to PostgreSQL, stores bytes in MinIO with a UUID-based object key, enqueues extraction + classification on Redis, and the Celery worker processes the task. `GET /health` returns `postgres: ok` and `minio: ok`. ROADMAP.md Phase 1 success criteria #1, #3, and #4 are verified live here (#2 was verified in Plan 03). + + + From the project root: + + 1. Ensure `.env` exists with all variables from `.env.example` filled in: `cp .env.example .env` (if not present) and replace each `changeme_*` placeholder with a value of your choice. The DATABASE_URL/DATABASE_MIGRATE_URL passwords MUST match the hardcoded passwords in `docker/postgres/initdb.d/01-init-users.sql` from Plan 01 (which itself was committed during Wave 1). The REDIS_URL password must match REDIS_PASSWORD. + + 2. Tear down any prior state: `docker compose down -v` (the `-v` deletes the postgres_data and minio_data named volumes so the init script will re-run). + + 3. Boot everything: `docker compose up --build -d`. Wait ~30 seconds. + + 4. Verify all services are healthy: `docker compose ps`. The `STATUS` column must show `Up (healthy)` for `postgres`, `minio`, `redis`, `backend`, AND `celery-worker`. If any is `unhealthy`, capture `docker compose logs ` and resolve before continuing. + + 5. Apply the migration against the live DB: `docker compose exec backend bash -lc "cd /app && alembic upgrade head"`. Must exit 0 with `Running upgrade -> 0001`. + + 6. Hit the health endpoint: `curl -s http://localhost:8000/health | python3 -m json.tool`. The response MUST be: + ``` + { + "status": "ok", + "checks": { + "postgres": "ok", + "minio": "ok" + } + } + ``` + + 7. Upload a real PDF or text file. Pick any small PDF (or use `printf 'Test document about invoices and contracts.' > /tmp/test.txt` first). Then: + ``` + curl -s -X POST http://localhost:8000/api/documents/upload \ + -F "file=@/tmp/test.txt;type=text/plain" \ + -F "auto_classify=true" | python3 -m json.tool + ``` + Confirm the response includes: + - `"id"` — a 36-character UUID string + - `"original_name": "test.txt"` + - `"size_bytes"` matching the file size + - `"topics": []` (classification is async — the Celery worker fills this in seconds later) + + 8. Confirm the document landed in PostgreSQL: + `docker compose exec postgres psql -U docuvault_app -d docuvault -c "SELECT id, filename, object_key, status FROM documents ORDER BY created_at DESC LIMIT 1;"` + — exactly one row; `object_key` starts with `null-user/` (D-03 sentinel from Plan 04); `status` is `pending` initially then `classified` or `classification_failed` after the worker runs. + + 9. Confirm the document landed in MinIO. The object key from step 8 will look like `null-user//.txt`. Either use the MinIO web console at `http://localhost:9001` (login with `MINIO_ROOT_USER` / `MINIO_ROOT_PASSWORD` from `.env`) and navigate to `docuvault` bucket → confirm the object exists with non-zero size — OR use `mc`: + `docker compose exec minio mc alias set local http://localhost:9000 $MINIO_ROOT_USER $MINIO_ROOT_PASSWORD` then `docker compose exec minio mc ls local/docuvault/null-user/`. + + 10. Confirm the Celery worker processed the task: + `docker compose logs celery-worker | tail -30` + — look for a `Task tasks.document_tasks.extract_and_classify[...] received` line followed by `succeeded` or a structured error. If the task succeeded, run: + `curl -s http://localhost:8000/api/documents | python3 -m json.tool` + — the response should show one item with `extracted_text` populated and possibly `topics` populated by the AI classifier (depending on AI provider config; if no `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` is set, classification will fail gracefully and `status` will be `classification_failed` — that is acceptable for this walking-skeleton check; the storage layer worked.). + + 11. Delete the document: + `curl -s -X DELETE http://localhost:8000/api/documents/` returns `{"success": true}`. + Then confirm the MinIO object is gone: `docker compose exec minio mc ls local/docuvault/null-user//` returns empty or "Object does not exist". + + 12. Run the test suite against the live stack: + `docker compose exec -e INTEGRATION=1 backend bash -lc "cd /app && pytest tests/ -v"` + — every test PASSED, zero FAILED, zero XFAIL (skips for integration tests when INTEGRATION=0 are acceptable on a host-only run; when INTEGRATION=1 inside the container with live services, they must run and pass). + + + All 12 verification steps succeed. The walking skeleton is live: PDF → API → PostgreSQL + MinIO + Celery → extracted text → classification → DB row. ROADMAP.md Phase 1 is complete. + + + Common failures and fixes: + - `/health` reports `postgres: error: ConnectionRefusedError`: postgres healthcheck didn't gate startup; check `depends_on: condition: service_healthy` is set on `backend` and `celery-worker`. Inspect `docker compose ps` and `docker compose logs postgres`. + - `/health` reports `minio: error: bucket missing`: the lifespan bucket-create failed. Check `docker compose logs backend` for the `make_bucket` error. Likely cause: `MINIO_ACCESS_KEY` / `MINIO_SECRET_KEY` mismatch — the lifespan client connects with app-level keys but MinIO only knows about the root user on first boot. Workaround for Phase 1: temporarily set `MINIO_ACCESS_KEY=$MINIO_ROOT_USER` and `MINIO_SECRET_KEY=$MINIO_ROOT_PASSWORD` in `.env` (Phase 2 will set up an app-level access policy via `mc admin user add` during MinIO init). + - Celery worker logs show `[ERROR/MainProcess] consumer: Cannot connect to redis://...`: the REDIS_PASSWORD or REDIS_URL is wrong, or the password contains a special character not URL-encoded. Re-confirm `REDIS_URL` form `redis://:@redis:6379/0`. + - Upload returns 500 with `MissingGreenlet`: a session attribute access happened after commit; verify `expire_on_commit=False` in `db/session.py`. + - Task never runs: `docker compose logs celery-worker` shows it can't import `tasks.document_tasks`; verify `tasks/__init__.py` exists and `celery_app.autodiscover_tasks(["tasks"])` is called. + + Type "approved" once steps 1-12 all pass. If any step fails, describe the failure mode and we resume with a fix plan. + + + + + +## Trust Boundaries + +| Boundary | Description | +|----------|-------------| +| Browser → FastAPI | HTTP/JSON (Phase 1: CORS `*` — Phase 2 locks down); multipart upload bytes traverse this boundary | +| FastAPI → Celery / Redis | Task payload is the document_id string only; no user input passed | +| FastAPI lifespan → MinIO | Bucket auto-create at startup; client persists on `app.state.minio` | +| Celery worker → MinIO + PostgreSQL | Worker re-fetches bytes from MinIO and reads/writes Document row | + +## STRIDE Threat Register + +| Threat ID | Category | Component | Disposition | Mitigation Plan | +|-----------|----------|-----------|-------------|-----------------| +| T-01-05-01 | Spoofing | Unauthenticated upload endpoint accepting any client | accept | Phase 1 has no auth (D-03 — user_id nullable); upload accessible to anyone reaching `localhost:8000`. Phase 2 adds JWT + CSRF + rate-limit. Documented in SKELETON.md "Out of Scope". | +| T-01-05-02 | Tampering | MIME-type spoofing on upload | mitigate | The existing `ALLOWED_MIME_TYPES` set in `api/documents.py` is preserved verbatim. Phase 4 (DOC-02) adds magic-byte verification before download/preview. | +| T-01-05-03 | Information Disclosure | `/health` revealing internal error class names | mitigate | `/health` error strings format as `f"error: {type(e).__name__}: {e}"` — exposes Python exception class name, which is acceptable for an internal/dev endpoint in Phase 1. Phase 2 will trim to `"error"` or `"unhealthy"` once the endpoint is reachable from the internet. Documented note in `main.py`. | +| T-01-05-04 | Tampering | Celery task receives untrusted document_id and might query arbitrary rows | mitigate | `extract_and_classify` only takes a `document_id` string from the upload path — never from a user query parameter. Task code does `session.get(Document, uuid.UUID(document_id))` which raises `ValueError` for non-UUID input; no SQL injection vector. Document row lookup is single-row by primary key only. | +| T-01-05-05 | Denial of Service | Lifespan bucket-create on every reboot blocks startup | mitigate | `if not bucket_exists: make_bucket` is idempotent — fast on warm starts. If MinIO is unreachable at startup, lifespan raises and the FastAPI app fails to boot — this is intentional and surfaces the failure to Compose's `depends_on: condition: service_healthy` (which gated startup but cannot catch a post-startup MinIO crash). | +| T-01-05-06 | Information Disclosure | `app.state.minio` reused across handlers | accept | The client holds connection state but no per-user credentials. All app handlers see the same `app.state.minio` — acceptable since Phase 1 has no per-user isolation. Phase 5 will introduce per-user `StorageBackend` instances for cloud backends. | +| T-01-05-SC | Tampering | npm/pip installs | N/A | No new package installs in this plan — all dependencies were added in Plan 01 and verified via RESEARCH.md Package Legitimacy Audit. | + + + +- Tasks 1-3 are autonomous; Task 4 is a blocking human-verify checkpoint. +- After Task 4 approval, ROADMAP.md Phase 1 success criteria #1 (docker compose up healthy), #3 (extract + classify pipeline works), and #4 (MinIO key schema enforced) are all live-verified. Criterion #2 was verified in Plan 03 Task 3. +- `docker compose exec -e INTEGRATION=1 backend bash -lc "cd /app && pytest tests/ -v"` exits 0 with zero FAILED. + + + +- `backend/main.py` lifespan creates the MinIO bucket and disposes the engine; `/health` returns the postgres+minio shape per D-07. +- `backend/api/documents.py` and `backend/api/topics.py` are entirely async-session-driven; upload-time classification is queued via Celery `.delay()`. +- `backend/celery_app.py` and `backend/tasks/document_tasks.py` are wired and discoverable. +- `backend/services/classifier.py` accepts an `AsyncSession`. +- `backend/config.py` is pruned of legacy flat-file constants. +- `backend/data/` is deleted; `tests/test_documents.py` is async-only; `tests/conftest.py` no longer ships a sync TestClient fixture. +- `docker compose up` boots healthy; the walking skeleton end-to-end check from Task 4 passes. + + + +Create `.planning/phases/01-infrastructure-foundation/01-05-SUMMARY.md` when done. Include: the exact `/health` JSON response observed at step 6 of Task 4, the actual MinIO object key produced at step 7-9, the Celery task log line from step 10, and any deviations from the plan (e.g., the temporary MinIO-root-as-app-key workaround called out in `if-broken`). + diff --git a/.planning/phases/01-infrastructure-foundation/01-PATTERNS.md b/.planning/phases/01-infrastructure-foundation/01-PATTERNS.md new file mode 100644 index 0000000..9c76511 --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-PATTERNS.md @@ -0,0 +1,1082 @@ +# Phase 1: Infrastructure Foundation - Pattern Map + +**Mapped:** 2026-05-21 +**Files analyzed:** 14 new/modified files +**Analogs found:** 12 / 14 + +--- + +## File Classification + +| New/Modified File | Role | Data Flow | Closest Analog | Match Quality | +|-------------------|------|-----------|----------------|---------------| +| `docker-compose.yml` | config | request-response | `docker-compose.yml` (current) | exact — extend in-place | +| `docker/postgres/initdb.d/01-init-users.sql` | config | batch | none in codebase | no analog | +| `backend/db/session.py` | config | CRUD | `backend/config.py` (module-level setup pattern) | partial | +| `backend/db/models.py` | model | CRUD | none in codebase | no analog (schema from RESEARCH.md) | +| `backend/deps/db.py` | utility | CRUD | `backend/config.py` (module-level constants pattern) | partial | +| `backend/config.py` | config | request-response | `backend/config.py` (current) | exact — extend in-place | +| `backend/main.py` | config | request-response | `backend/main.py` (current) | exact — extend in-place | +| `backend/storage/base.py` | utility | request-response | `backend/ai/base.py` | exact role-match | +| `backend/storage/__init__.py` | utility | request-response | `backend/ai/__init__.py` | exact role-match | +| `backend/storage/minio_backend.py` | service | file-I/O | `backend/ai/openai_provider.py` | role-match (ABC impl) | +| `backend/services/storage.py` | service | CRUD | `backend/services/storage.py` (current) | exact — replace in-place | +| `backend/celery_app.py` | config | event-driven | none in codebase | no analog | +| `backend/tasks/document_tasks.py` | service | event-driven | `backend/services/classifier.py` | role-match (orchestration) | +| `backend/api/documents.py` | controller | request-response | `backend/api/documents.py` (current) | exact — update in-place | +| `backend/api/topics.py` | controller | request-response | `backend/api/topics.py` (current) | exact — update in-place | +| `backend/requirements.txt` | config | — | `backend/requirements.txt` (current) | exact — extend in-place | +| `.env.example` | config | — | `.env.example` (current) | exact — extend in-place | +| `backend/tests/conftest.py` | test | CRUD | `backend/tests/conftest.py` (current) | exact — update in-place | +| `backend/tests/test_health.py` | test | request-response | `backend/tests/test_health.py` (current) | exact — update in-place | +| `backend/tests/test_documents.py` | test | CRUD | `backend/tests/test_documents.py` (current) | exact — update in-place | +| `backend/tests/test_storage.py` | test | file-I/O | none in codebase | no analog (new) | +| `backend/alembic.ini` | config | — | none in codebase | no analog | +| `backend/migrations/env.py` | config | batch | none in codebase | no analog (pattern from RESEARCH.md) | +| `backend/migrations/versions/0001_initial_schema.py` | migration | batch | none in codebase | no analog (schema from RESEARCH.md) | + +--- + +## Pattern Assignments + +### `docker-compose.yml` (config, request-response) + +**Analog:** `docker-compose.yml` (current, lines 1–26) + +**Existing service block pattern** (lines 1–26 of current `docker-compose.yml`): +```yaml +services: + backend: + build: ./backend + ports: + - "8000:8000" + volumes: + - ./backend/data:/app/data + - ./backend:/app + environment: + - DATA_DIR=/app/data + - PYTHONDONTWRITEBYTECODE=1 + extra_hosts: + - "host.docker.internal:host-gateway" + command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload + + frontend: + build: ./frontend + ports: + - "5173:5173" + volumes: + - ./frontend/src:/app/src + - ./frontend/index.html:/app/index.html + depends_on: + - backend + command: npm run dev -- --host 0.0.0.0 +``` + +**New services to add — copy structure from RESEARCH.md Pattern 6** (lines 512–567): +```yaml + postgres: + image: postgres:17-alpine + environment: + POSTGRES_DB: docuvault + POSTGRES_USER: postgres + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + volumes: + - postgres_data:/var/lib/postgresql/data + - ./docker/postgres/initdb.d:/docker-entrypoint-initdb.d:ro + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d docuvault"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + minio: + image: minio/minio:latest + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: ${MINIO_ROOT_USER} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD} + ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio_data:/data + healthcheck: + test: ["CMD", "mc", "ready", "local"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + + redis: + image: redis:7-alpine + command: redis-server --requirepass ${REDIS_PASSWORD} + healthcheck: + test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"] + interval: 10s + timeout: 3s + retries: 5 + + celery-worker: + build: ./backend + command: celery -A celery_app worker --loglevel=info -Q documents + environment: + - DATABASE_URL=${DATABASE_URL} + - REDIS_URL=${REDIS_URL} + - MINIO_ENDPOINT=${MINIO_ENDPOINT} + - MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY} + - MINIO_SECRET_KEY=${MINIO_SECRET_KEY} + - MINIO_BUCKET=${MINIO_BUCKET} + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + minio: + condition: service_healthy +``` + +**`backend` service update — add `depends_on` conditions:** +```yaml + backend: + ... + environment: + - DATABASE_URL=${DATABASE_URL} + - DATABASE_MIGRATE_URL=${DATABASE_MIGRATE_URL} + - MINIO_ENDPOINT=${MINIO_ENDPOINT} + - MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY} + - MINIO_SECRET_KEY=${MINIO_SECRET_KEY} + - MINIO_BUCKET=${MINIO_BUCKET} + - REDIS_URL=${REDIS_URL} + - PYTHONDONTWRITEBYTECODE=1 + depends_on: + postgres: + condition: service_healthy + minio: + condition: service_healthy + redis: + condition: service_healthy +``` + +**Remove** the `volumes:` entry for `./backend/data:/app/data` — flat-file storage is deleted (D-04). + +**Add named volumes block at end of file:** +```yaml +volumes: + postgres_data: + minio_data: +``` + +--- + +### `backend/config.py` (config, request-response) + +**Analog:** `backend/config.py` (current, lines 1–52) + +**Existing pattern** (lines 1–10 — module-level constants, NOT Pydantic Settings): +```python +import json +import os +from pathlib import Path + +DATA_DIR = Path(os.environ.get("DATA_DIR", "/app/data")) +UPLOADS_DIR = DATA_DIR / "uploads" +METADATA_DIR = DATA_DIR / "metadata" +TOPICS_FILE = DATA_DIR / "topics.json" +SETTINGS_FILE = DATA_DIR / "settings.json" +``` + +**Replace entirely with Pydantic Settings** (per RESEARCH.md Code Examples, lines 914–937). +The existing `config.py` does not use `pydantic-settings` — Phase 1 introduces it. The pattern to follow is the RESEARCH.md example, not the current file. Keep the `DEFAULT_SYSTEM_PROMPT` and `DEFAULT_SETTINGS` constants for backward compatibility during the transition; remove `ensure_data_dirs()` and all path constants once `services/storage.py` is replaced. + +**New pattern:** +```python +# backend/config.py +from pydantic_settings import BaseSettings + +class Settings(BaseSettings): + # Legacy — keep during transition, remove after storage.py rewrite + data_dir: str = "/app/data" + + # Phase 1 additions + database_url: str = "postgresql+psycopg://docuvault_app:changeme@postgres/docuvault" + database_migrate_url: str = "postgresql+psycopg://docuvault_migrate:changeme@postgres/docuvault" + minio_endpoint: str = "minio:9000" + minio_access_key: str = "docuvault_app" + minio_secret_key: str = "changeme" + minio_bucket: str = "docuvault" + redis_url: str = "redis://:changeme@redis:6379/0" + secret_key: str = "CHANGEME" # documented for Phase 2; not read in Phase 1 + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + +settings = Settings() +``` + +Note: `pydantic-settings` is already in `requirements.txt` (line 4). No new dependency needed. + +--- + +### `backend/main.py` (config, request-response) + +**Analog:** `backend/main.py` (current, lines 1–34) + +**Existing lifespan pattern** (lines 10–14): +```python +from contextlib import asynccontextmanager +from fastapi import FastAPI + +@asynccontextmanager +async def lifespan(app: FastAPI): + ensure_data_dirs() + yield +``` + +**Extend lifespan** — replace `ensure_data_dirs()` call with engine setup and MinIO bucket init. Copy the `asynccontextmanager` + `yield` structure exactly: +```python +from contextlib import asynccontextmanager +import asyncio +from fastapi import FastAPI +from minio import Minio +from db.session import engine +from config import settings + +@asynccontextmanager +async def lifespan(app: FastAPI): + # MinIO bucket initialization + minio_client = Minio( + settings.minio_endpoint, + access_key=settings.minio_access_key, + secret_key=settings.minio_secret_key, + secure=False, + ) + exists = await asyncio.to_thread(minio_client.bucket_exists, settings.minio_bucket) + if not exists: + await asyncio.to_thread(minio_client.make_bucket, settings.minio_bucket) + app.state.minio = minio_client + yield + # Shutdown: close all pooled connections + await engine.dispose() +``` + +**Extend `/health` endpoint** — keep existing route signature `@app.get("/health")` and `async def health()`, extend the body: +```python +@app.get("/health") +async def health(request: Request): + checks = {} + # PostgreSQL probe + try: + async with AsyncSessionLocal() as session: + await session.execute(text("SELECT 1")) + checks["postgres"] = "ok" + except Exception as e: + checks["postgres"] = f"error: {e}" + + # MinIO probe + try: + ok = await asyncio.to_thread(request.app.state.minio.bucket_exists, settings.minio_bucket) + checks["minio"] = "ok" if ok else "bucket missing" + except Exception as e: + checks["minio"] = f"error: {e}" + + overall = "ok" if all(v == "ok" for v in checks.values()) else "degraded" + return {"status": overall, "checks": checks} +``` + +--- + +### `backend/db/session.py` (config, CRUD) + +**Analog:** None exact. Closest structural analog is `backend/config.py` (module-level initialization pattern at lines 1–10). + +**Pattern from RESEARCH.md Pattern 1** (lines 240–266): +```python +# backend/db/session.py +from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession +from config import settings + +engine = create_async_engine( + settings.database_url, # postgresql+psycopg://docuvault_app:...@postgres/docuvault + pool_pre_ping=True, # detect stale connections before use + echo=False, +) + +AsyncSessionLocal = async_sessionmaker( + engine, + class_=AsyncSession, + expire_on_commit=False, # prevent MissingGreenlet errors after commit +) +``` + +**Key rule:** `expire_on_commit=False` is mandatory — see RESEARCH.md Pitfall 1. + +--- + +### `backend/deps/db.py` (utility, CRUD) + +**Analog:** None exact. The dependency injection `yield` pattern mirrors how `backend/tests/conftest.py` yields fixtures (lines 13–43). + +**Pattern from RESEARCH.md Pattern 1** (lines 258–266): +```python +# backend/deps/db.py +from db.session import AsyncSessionLocal + +async def get_db(): + async with AsyncSessionLocal() as session: + try: + yield session + finally: + await session.close() +``` + +Use as a FastAPI dependency: `session: AsyncSession = Depends(get_db)`. + +--- + +### `backend/db/models.py` (model, CRUD) + +**Analog:** None in codebase. The full schema is specified in RESEARCH.md Code Examples (lines 769–908). + +**Import block to copy:** +```python +import uuid +from datetime import datetime, timezone +from sqlalchemy import ( + Boolean, BigInteger, ForeignKey, Index, String, Text, + TIMESTAMP, UniqueConstraint, Integer +) +from sqlalchemy.dialects.postgresql import UUID, INET, JSONB +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship +from sqlalchemy.sql import func +``` + +**Base class pattern:** +```python +class Base(DeclarativeBase): + pass +``` + +**Critical D-03:** `Document.user_id` must be `nullable=True` in Phase 1: +```python +user_id: Mapped[uuid.UUID | None] = mapped_column( + UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=True +) +``` + +Use the full schema from RESEARCH.md lines 788–908 verbatim — it was designed to be implementation-ready. + +--- + +### `backend/storage/base.py` (utility, request-response) + +**Analog:** `backend/ai/base.py` (lines 1–33) — exact structural match. + +**ABC pattern from `backend/ai/base.py`** (lines 1–33): +```python +from abc import ABC, abstractmethod +from dataclasses import dataclass, field + +class AIProvider(ABC): + @abstractmethod + async def classify(self, ...) -> ClassificationResult: ... + + @abstractmethod + async def health_check(self) -> bool: ... +``` + +**Apply same structure** for `StorageBackend`. The `health_check()` abstract method is already present in `ai/base.py` (line 31) — mirror it exactly in `StorageBackend`. Method signatures from RESEARCH.md Pattern 8 (lines 617–640): +```python +# backend/storage/base.py +from abc import ABC, abstractmethod + +class StorageBackend(ABC): + @abstractmethod + async def put_object( + self, user_id: str, document_id: str, + file_bytes: bytes, extension: str, content_type: str, + ) -> str: + """Store object; return the object_key used.""" + + @abstractmethod + async def get_object(self, object_key: str) -> bytes: + """Retrieve object bytes by key.""" + + @abstractmethod + async def delete_object(self, object_key: str) -> None: + """Delete object by key.""" + + @abstractmethod + async def presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str: + """Return a time-limited download URL.""" + + @abstractmethod + async def health_check(self) -> bool: + """Return True if backend is reachable.""" +``` + +--- + +### `backend/storage/__init__.py` (utility, request-response) + +**Analog:** `backend/ai/__init__.py` (lines 1–36) — exact structural match. + +**Factory pattern from `backend/ai/__init__.py`** (lines 1–10 and 8–36): +```python +from ai.base import AIProvider, ClassificationResult +from ai.anthropic_provider import AnthropicProvider +# ... more imports + +def get_provider(settings: dict) -> AIProvider: + active = settings.get("active_provider", "lmstudio") + match active: + case "anthropic": + return AnthropicProvider(...) + case _: + raise ValueError(f"Unknown AI provider: {active}") +``` + +**Apply same factory pattern** for storage. Phase 1 has only one backend (MinIO), so the `match` can be omitted initially, but the factory function signature is mandatory: +```python +# backend/storage/__init__.py +from config import settings +from storage.minio_backend import MinIOBackend +from storage.base import StorageBackend + +def get_storage_backend() -> StorageBackend: + return MinIOBackend( + endpoint=settings.minio_endpoint, + access_key=settings.minio_access_key, + secret_key=settings.minio_secret_key, + bucket=settings.minio_bucket, + secure=False, + ) +``` + +--- + +### `backend/storage/minio_backend.py` (service, file-I/O) + +**Analog:** `backend/ai/openai_provider.py` (lines 1–104) — same ABC-implementation pattern. + +**ABC implementation pattern from `backend/ai/openai_provider.py`** (lines 9–70): +```python +class OpenAIProvider(AIProvider): + def __init__(self, api_key: str, model: str = "gpt-4o", base_url: str | None = None): + self._api_key = api_key + self._model = model + self._base_url = base_url + + def _client(self) -> AsyncOpenAI: + return AsyncOpenAI(api_key=self._api_key or "placeholder", base_url=self._base_url) + + async def health_check(self) -> bool: + try: + await self._client().chat.completions.create(...) + return True + except Exception: + return False +``` + +Copy this structure: `__init__` stores config, private `_client` attribute holds SDK instance, every method is `async def`, `health_check` wraps in `try/except` returning `bool`. + +**Key difference from AI providers:** MinIO SDK is synchronous — all calls must be wrapped in `asyncio.to_thread()`. Copy the wrapping pattern from RESEARCH.md Pattern 3 (lines 349–403): +```python +import asyncio +import io +import uuid + +class MinIOBackend(StorageBackend): + def __init__(self, endpoint, access_key, secret_key, bucket, secure=False): + self._client = Minio(endpoint=endpoint, access_key=access_key, + secret_key=secret_key, secure=secure) + self._bucket = bucket + + async def put_object(self, user_id, document_id, file_bytes, extension, content_type) -> str: + object_key = f"{user_id}/{document_id}/{uuid.uuid4()}{extension}" + data = io.BytesIO(file_bytes) # BytesIO() constructor sets pointer at 0 — no seek(0) needed + await asyncio.to_thread( + self._client.put_object, + self._bucket, object_key, data, length=len(file_bytes), content_type=content_type, + ) + return object_key + + async def health_check(self) -> bool: + try: + return await asyncio.to_thread(self._client.bucket_exists, self._bucket) + except Exception: + return False +``` + +--- + +### `backend/services/storage.py` (service, CRUD) + +**Analog:** `backend/services/storage.py` (current, lines 1–188) — replace entirely. + +**Current pattern shows the data-access interface** that `api/documents.py` depends on (lines 18–95). The new implementation must preserve the same function signatures where possible to minimize changes in `api/documents.py`. The new `storage.py` is a thin orchestrator: it calls `db/session.py` for ORM operations and `storage/minio_backend.py` for object storage. + +**New async signatures to match existing callers in `api/documents.py` (lines 32–57):** +```python +# Old (sync): storage.save_upload(content, file.filename, mime) +# New (async): await storage.save_upload(content, file.filename, mime) + +# Old (sync): storage.save_metadata(meta) +# New (async): await storage.save_metadata(meta) — or merged into save_upload + +# Old (sync): storage.list_metadata(topic=topic) +# New (async): await storage.list_metadata(topic=topic) + +# Old (sync): storage.get_metadata(doc_id) +# New (async): await storage.get_metadata(doc_id) + +# Old (sync): storage.delete_document(doc_id) +# New (async): await storage.delete_document(doc_id) +``` + +**Session injection pattern:** New `storage.py` functions accept an `AsyncSession` parameter (injected by the FastAPI dependency via `Depends(get_db)`), not create their own. This mirrors how the classifier calls storage functions with state passed in. + +**Error handling from current `storage.py`** (lines 34–38 — return `None` for not-found, not exceptions): +```python +def get_metadata(doc_id: str) -> dict | None: + path = METADATA_DIR / f"{doc_id}.json" + if not path.exists(): + return None + return json.loads(path.read_text()) +``` +Keep the same `None`-on-not-found contract in the async ORM version so `api/documents.py` `if meta is None: raise HTTPException(404, ...)` checks continue to work unchanged. + +--- + +### `backend/celery_app.py` (config, event-driven) + +**Analog:** None in codebase. + +**Pattern from RESEARCH.md Pattern 5** (lines 462–475): +```python +# backend/celery_app.py +import os +from celery import Celery + +celery_app = Celery("docuvault") +celery_app.conf.broker_url = os.environ.get("REDIS_URL", "redis://redis:6379/0") +celery_app.conf.result_backend = os.environ.get("REDIS_URL", "redis://redis:6379/0") +celery_app.conf.task_serializer = "json" +celery_app.conf.result_serializer = "json" +celery_app.conf.accept_content = ["json"] +celery_app.conf.task_routes = { + "tasks.document_tasks.*": {"queue": "documents"}, +} +``` + +**Critical:** Use `os.environ.get()` directly here, NOT `from config import settings`. `config.py` imports pydantic-settings, which may trigger FastAPI-related imports. Keep `celery_app.py` minimal to avoid Pitfall 7 (circular imports with the FastAPI app). + +--- + +### `backend/tasks/document_tasks.py` (service, event-driven) + +**Analog:** `backend/services/classifier.py` (lines 1–59) — same orchestration pattern (load metadata, load settings, call services, persist results). + +**Orchestration pattern from `backend/services/classifier.py`** (lines 11–46): +```python +async def classify_document(doc_id: str, topic_names: list[str] | None = None) -> list[str]: + meta = storage.get_metadata(doc_id) + if meta is None: + raise ValueError(f"Document {doc_id} not found") + + settings = storage.load_settings() + provider = get_provider(settings) + text = meta.get("extracted_text", "") + result = await provider.classify(text[:MAX_AI_CHARS], topic_names, system_prompt) + # ... persist results + storage.update_document_topics(doc_id, final_topics) + return final_topics +``` + +**Apply same orchestration structure** for the Celery task, with three critical differences: +1. Task function must be `def`, not `async def` (Celery workers have no asyncio event loop) +2. Import services directly — never import from `main.py` or any router module +3. Use `asyncio.run()` to call async service functions if unavoidable + +```python +# backend/tasks/document_tasks.py +from celery_app import celery_app + +@celery_app.task(name="tasks.document_tasks.extract_and_classify") +def extract_and_classify(document_id: str) -> dict: + import asyncio + from services import extractor, classifier + # ... call services, persist results + return {"document_id": document_id, "status": "classified"} +``` + +**Replace in `api/documents.py`** (lines 49–56): +```python +# Old: +if auto_classify: + topics = await classifier.classify_document(saved["id"]) +# New: +from tasks.document_tasks import extract_and_classify +extract_and_classify.delay(str(saved_doc.id)) +``` + +--- + +### `backend/api/documents.py` (controller, request-response) + +**Analog:** `backend/api/documents.py` (current, lines 1–102) — update in-place. + +**Existing route structure to preserve** (lines 21–58): +- `@router.post("/upload")` — keep signature `(file: UploadFile, auto_classify: bool)` +- `@router.get("")` — keep pagination params `(topic, page, per_page)` +- `@router.get("/{doc_id}")` — keep path param +- `@router.delete("/{doc_id}")` — keep path param +- `@router.post("/{doc_id}/classify")` — keep path param + body + +**Session injection change — current** (lines 1–4): +```python +from services import storage, extractor, classifier +``` +**New** — add session dependency: +```python +from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query, Depends +from sqlalchemy.ext.asyncio import AsyncSession +from deps.db import get_db +from services import storage, extractor +from tasks.document_tasks import extract_and_classify +``` + +**Add `session` parameter to route handlers:** +```python +@router.post("/upload") +async def upload_document( + file: UploadFile = File(...), + auto_classify: bool = Form(True), + session: AsyncSession = Depends(get_db), # NEW +): +``` + +**Error handling pattern** (lines 50–56 — keep unchanged): +```python +try: + topics = await classifier.classify_document(saved["id"]) + meta["topics"] = topics +except Exception as e: + meta["classification_error"] = str(e) # classification failure is non-fatal +``` + +**HTTP error pattern** (lines 75–77 — keep unchanged): +```python +if meta is None: + raise HTTPException(404, "Document not found") +``` + +--- + +### `backend/api/topics.py` (controller, request-response) + +**Analog:** `backend/api/topics.py` (current, lines 1–73) — update in-place. + +**Existing Pydantic model pattern** (lines 8–19): +```python +class TopicCreate(BaseModel): + name: str + description: str = "" + color: str = "#6366f1" + +class TopicUpdate(BaseModel): + name: str | None = None + description: str | None = None + color: str | None = None +``` +Keep these models unchanged — they match the PostgreSQL `topics` table columns. + +**Storage call pattern** (lines 26–30): +```python +@router.get("") +async def list_topics(): + topics = storage.load_topics() + counts = storage.topic_doc_counts() +``` +Update to inject `session: AsyncSession = Depends(get_db)` and call async ORM queries instead of flat-file storage functions. Response shape must remain identical (`{"topics": [...]}` with `doc_count` appended per topic). + +--- + +### `backend/requirements.txt` (config) + +**Analog:** `backend/requirements.txt` (current, lines 1–16) + +**Current file** (lines 1–16): +``` +fastapi>=0.111 +uvicorn[standard]>=0.29 +python-multipart +pydantic-settings>=2.2 +anthropic>=0.26 +openai>=1.30 +PyMuPDF>=1.24 +python-docx>=1.1 +pytesseract>=0.3 +Pillow>=10.3 +filelock>=3.14 # REMOVE — replaced by PostgreSQL transactions +aiofiles>=23.2 +httpx>=0.27 +pytest>=8.2 +pytest-asyncio>=0.23 +``` + +**Additions (append to file):** +``` +sqlalchemy[asyncio]>=2.0 +psycopg[binary]>=3.3 +alembic>=1.13 +minio>=7.2 +celery[redis]>=5.4 +redis>=7.0 +``` + +**Remove:** `filelock>=3.14` — no longer needed once `services/storage.py` is replaced (RESEARCH.md line 952). + +--- + +### `.env.example` (config) + +**Analog:** `.env.example` (current, lines 1–6) + +**Current file** (lines 1–6): +```bash +# Copy to .env and fill in as needed. +ANTHROPIC_API_KEY= +OPENAI_API_KEY= +``` + +**Extend with all Phase 1 vars** (D-11, D-13, D-15, D-16). Keep existing vars at top. Pattern: group by service, comment each variable: +```bash +# ── PostgreSQL ─────────────────────────────────────────────────────────────── +# App user (restricted: SELECT/INSERT/UPDATE/DELETE only — used by FastAPI + Celery) +DATABASE_URL=postgresql+psycopg://docuvault_app:changeme@postgres:5432/docuvault +# Migration user (DDL privileges — used ONLY by Alembic, never by the app at runtime) +DATABASE_MIGRATE_URL=postgresql+psycopg://docuvault_migrate:changeme@postgres:5432/docuvault +# Superuser password for the postgres init container (used only by initdb.d scripts) +POSTGRES_PASSWORD=changeme + +# ── MinIO ──────────────────────────────────────────────────────────────────── +MINIO_ROOT_USER=minioadmin +MINIO_ROOT_PASSWORD=changeme +MINIO_ENDPOINT=minio:9000 +# App-level access key (minimal permissions: read/write on docuvault bucket only) +MINIO_ACCESS_KEY=docuvault_app +MINIO_SECRET_KEY=changeme +MINIO_BUCKET=docuvault + +# ── Redis ──────────────────────────────────────────────────────────────────── +REDIS_PASSWORD=changeme +REDIS_URL=redis://:changeme@redis:6379/0 + +# ── Security (Phase 2) ─────────────────────────────────────────────────────── +# Not read by the app in Phase 1. Documented here for Phase 2 JWT + HKDF use. +SECRET_KEY=CHANGEME-replace-with-64-char-random-hex +``` + +--- + +### `backend/tests/conftest.py` (test, CRUD) + +**Analog:** `backend/tests/conftest.py` (current, lines 1–71) — update in-place. + +**Current fixture pattern** (lines 13–43): +```python +@pytest.fixture(autouse=True) +def isolated_data_dir(monkeypatch, tmp_path): + """Each test gets its own clean data directory.""" + data_dir = tmp_path / "data" + ... + monkeypatch.setenv("DATA_DIR", str(data_dir)) + import config + monkeypatch.setattr(config, "DATA_DIR", data_dir) + ... + yield data_dir +``` + +**New async session fixture** — replace `isolated_data_dir` with an async SQLite in-memory engine for unit tests, and keep a separate fixture for integration tests using the real Docker database. Copy the `yield` + teardown structure exactly: +```python +import pytest +import pytest_asyncio +from httpx import AsyncClient, ASGITransport +from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession +from sqlalchemy.pool import StaticPool +from db.models import Base +from deps.db import get_db +from main import app + +@pytest_asyncio.fixture +async def db_session(): + """In-memory async SQLite session for unit tests.""" + engine = create_async_engine( + "sqlite+aiosqlite:///:memory:", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + + AsyncTestSession = async_sessionmaker(engine, expire_on_commit=False) + async with AsyncTestSession() as session: + yield session + + await engine.dispose() + +@pytest_asyncio.fixture +async def client(db_session): + """Async test client with DB dependency overridden.""" + app.dependency_overrides[get_db] = lambda: db_session + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as c: + yield c + app.dependency_overrides.clear() +``` + +Note: `aiosqlite` must be added to `requirements.txt` for tests. Alternatively, pin to the real PostgreSQL test database via `DATABASE_URL` env var in integration tests. + +--- + +### `backend/tests/test_health.py` (test, request-response) + +**Analog:** `backend/tests/test_health.py` (current, lines 1–5) — update in-place. + +**Current test** (lines 1–5): +```python +def test_health(client): + resp = client.get("/health") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} +``` + +**Extended pattern** — keep the existing test function name; add new assertions for the richer response shape. Use the `async/await` style required by `pytest-asyncio`: +```python +import pytest + +async def test_health_ok(client): + resp = await client.get("/health") + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "ok" + +async def test_health_checks_postgres_and_minio(client): + resp = await client.get("/health") + data = resp.json() + assert "checks" in data + assert "postgres" in data["checks"] + assert "minio" in data["checks"] + assert data["checks"]["postgres"] == "ok" + assert data["checks"]["minio"] == "ok" +``` + +--- + +### `backend/tests/test_documents.py` (test, CRUD) + +**Analog:** `backend/tests/test_documents.py` (current, lines 1–108) — port to async. + +**Current sync pattern** (lines 1–14): +```python +def test_upload_txt_no_classify(client, sample_txt): + with open(sample_txt, "rb") as f: + resp = client.post( + "/api/documents/upload", + files={"file": ("sample.txt", f, "text/plain")}, + data={"auto_classify": "false"}, + ) + assert resp.status_code == 200 +``` + +**Port to async — change `def` to `async def` and `client.post` to `await client.post`:** +```python +async def test_upload_txt_no_classify(client, sample_txt): + with open(sample_txt, "rb") as f: + resp = await client.post( + "/api/documents/upload", + files={"file": ("sample.txt", f, "text/plain")}, + data={"auto_classify": "false"}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["original_name"] == "sample.txt" +``` + +Keep all assertion logic from the current file — only the `def`→`async def` and `client.verb()`→`await client.verb()` changes are needed. Add new tests for STORE-01 and STORE-02 requirements. + +--- + +### `backend/tests/test_storage.py` (test, file-I/O) + +**Analog:** None in codebase — new file. + +**Pattern from RESEARCH.md Validation section** (lines 1022–1028) and the MinIO key schema (D-06): +```python +import pytest +import re + +async def test_object_key_schema(db_session): + """STORE-02: MinIO object key must match {user_id}/{document_id}/{uuid4}{ext}.""" + from storage.minio_backend import MinIOBackend + # Use a mock or capture the key returned by put_object + key = f"user-123/doc-456/{uuid.uuid4()}.pdf" + pattern = re.compile( + r'^[0-9a-f-]{36}/[0-9a-f-]{36}/[0-9a-f-]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.[a-z]+$' + ) + assert pattern.match(key) + +async def test_filename_not_in_object_key(): + """STORE-02: Human-readable filename must NOT appear in the MinIO object key.""" + original_name = "invoice_Q3_2025.pdf" + # The key returned by MinIOBackend.put_object must not contain the original name + from storage.minio_backend import MinIOBackend + # ... call with mock Minio client, assert key does not contain original_name + assert original_name not in generated_key +``` + +--- + +### `docker/postgres/initdb.d/01-init-users.sql` (config, batch) + +**Analog:** None in codebase. + +**Pattern from RESEARCH.md Pattern 7** (lines 581–599): +```sql +-- docker/postgres/initdb.d/01-init-users.sql +-- Runs as the POSTGRES_USER superuser on first container start only. + +-- Migration user: DDL privileges (CREATE TABLE, ALTER TABLE, CREATE INDEX) +CREATE USER docuvault_migrate WITH PASSWORD 'PLACEHOLDER_MIGRATE_PASSWORD'; +GRANT ALL PRIVILEGES ON DATABASE docuvault TO docuvault_migrate; + +-- App user: runtime DML only (SELECT, INSERT, UPDATE, DELETE) +CREATE USER docuvault_app WITH PASSWORD 'PLACEHOLDER_APP_PASSWORD'; +GRANT CONNECT ON DATABASE docuvault TO docuvault_app; +``` + +**Important:** Passwords here are Docker init-time placeholders. The actual passwords come from `.env` via `docker-compose.yml` environment vars. The init script runs once on empty volume — it cannot read env vars directly, so passwords must be hardcoded (and should match what's in `.env`). + +The `ALTER DEFAULT PRIVILEGES` grant (for future tables created by Alembic) must be run inside the first Alembic migration (`0001_initial_schema.py`) using `op.execute()`, not in this init script — see RESEARCH.md Pattern 7 (lines 601–603) and Pitfall 4. + +--- + +### `backend/alembic.ini` and `backend/migrations/env.py` (config, batch) + +**Analog:** None in codebase. + +**`alembic.ini` key section** (from RESEARCH.md Pattern 2, lines 328–334): +```ini +[alembic] +script_location = migrations +sqlalchemy.url = %(DATABASE_MIGRATE_URL)s +``` + +**`migrations/env.py` async pattern** (from RESEARCH.md Pattern 2, lines 300–327): +```python +import asyncio +from sqlalchemy.ext.asyncio import async_engine_from_config +from sqlalchemy import pool +from alembic import context +from db.models import Base # noqa: F401 — must import to register all models + +target_metadata = Base.metadata + +def do_run_migrations(connection): + context.configure(connection=connection, target_metadata=target_metadata) + with context.begin_transaction(): + context.run_migrations() + +async def run_async_migrations(): + connectable = async_engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + await connectable.dispose() + +def run_migrations_online(): + asyncio.run(run_async_migrations()) +``` + +Generate the base file with `alembic init -t async migrations` — it produces this exact structure. Then add the `from db.models import Base` import and set `target_metadata = Base.metadata`. + +--- + +## Shared Patterns + +### Async/Await Convention +**Source:** `backend/main.py` (lines 10–13), `backend/api/documents.py` (lines 21–58) +**Apply to:** All new `db/`, `deps/`, `storage/`, `services/`, `tasks/` modules, all test files + +All new code is `async def`. Synchronous SDK calls (MinIO) use `asyncio.to_thread()`. Celery task functions are the only exception: they must be plain `def` (see RESEARCH.md Pitfall: Celery tasks are synchronous). + +### None-on-not-found Contract +**Source:** `backend/services/storage.py` (lines 34–38) +**Apply to:** `backend/services/storage.py` (rewritten), `backend/db/` query helpers + +```python +def get_metadata(doc_id: str) -> dict | None: + ... + if not path.exists(): + return None +``` +Async ORM equivalent: +```python +async def get_document(session: AsyncSession, doc_id: uuid.UUID) -> Document | None: + return await session.get(Document, doc_id) +``` +Return `None` for not-found; let the API layer raise `HTTPException(404)`. Never raise exceptions from the service layer for expected missing-resource conditions. + +### HTTP Error Pattern +**Source:** `backend/api/documents.py` (lines 74–77), `backend/api/topics.py` (lines 57–59) +**Apply to:** All API route handlers +```python +if meta is None: + raise HTTPException(404, "Document not found") +``` +Use bare string messages (no `detail=` keyword) — consistent with existing code. + +### Classification Failure Non-Fatal Pattern +**Source:** `backend/api/documents.py` (lines 50–56) +**Apply to:** `backend/api/documents.py` (updated upload handler) +```python +try: + topics = await classifier.classify_document(saved["id"]) + meta["topics"] = topics +except Exception as e: + meta["classification_error"] = str(e) # classification failure is non-fatal +``` +Document upload succeeds even if classification fails. Celery task failure equivalent: task enters FAILURE state but the document row remains with `status="pending"`. + +### ABC + Factory Pattern +**Source:** `backend/ai/base.py` + `backend/ai/__init__.py` (lines 1–36) +**Apply to:** `backend/storage/base.py` + `backend/storage/__init__.py` + +This is the project's established pattern for pluggable backends. Follow it exactly: separate `base.py` (ABC), `__init__.py` (factory function `get_X_backend()`), concrete implementations in separate modules. + +--- + +## No Analog Found + +Files with no close match in the codebase (planner should use RESEARCH.md patterns instead): + +| File | Role | Data Flow | Reason | +|------|------|-----------|--------| +| `docker/postgres/initdb.d/01-init-users.sql` | config | batch | No SQL scripts exist in codebase; use RESEARCH.md Pattern 7 | +| `backend/celery_app.py` | config | event-driven | No task queue code exists; use RESEARCH.md Pattern 5 | +| `backend/alembic.ini` | config | batch | No Alembic config exists; generate with `alembic init -t async` | +| `backend/migrations/env.py` | config | batch | No migrations exist; use `alembic init -t async` output + RESEARCH.md Pattern 2 | +| `backend/migrations/versions/0001_initial_schema.py` | migration | batch | No migrations exist; use full schema from RESEARCH.md Code Examples (lines 769–908) | +| `backend/tests/test_storage.py` | test | file-I/O | No object storage tests exist; new file per RESEARCH.md Validation section | + +--- + +## Metadata + +**Analog search scope:** `backend/` (all `.py` files), `docker-compose.yml`, `.env.example`, `backend/requirements.txt`, `backend/Dockerfile` +**Files scanned:** 25 +**Pattern extraction date:** 2026-05-21 diff --git a/.planning/phases/01-infrastructure-foundation/01-RESEARCH.md b/.planning/phases/01-infrastructure-foundation/01-RESEARCH.md new file mode 100644 index 0000000..a11fc35 --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-RESEARCH.md @@ -0,0 +1,1104 @@ +# Phase 1: Infrastructure Foundation - Research + +**Researched:** 2026-05-21 +**Domain:** PostgreSQL + MinIO + Redis + Celery wired into FastAPI via Docker Compose; Alembic async migrations; storage service rewrite +**Confidence:** HIGH + +--- + + +## User Constraints (from CONTEXT.md) + +### Locked Decisions + +**Schema Scope** +- D-01: Phase 1 initial Alembic migration creates the full v1 skeleton — all tables: `users`, `refresh_tokens`, `quotas`, `documents`, `topics`, `folders`, `shares`, `audit_log`, `cloud_connections`. Subsequent phases add data and constraints, not new tables. +- D-02: `groups` table stub included in Phase 1 migration (v2 feature; empty table, correct columns and FKs). +- D-03: `documents.user_id` is nullable in Phase 1 (no auth system yet). Phase 2 migration adds the NOT NULL constraint after the user/auth system is live. +- D-04: Existing `data/` directory contents (flat-file JSON metadata + uploaded files) are deleted in Phase 1. Test data only — no migration script needed. + +**App Wiring** +- D-05: Phase 1 switches the storage service layer to PostgreSQL + MinIO. `backend/services/storage.py` is rewritten to use async SQLAlchemy + MinIO SDK. The app does not continue using the filesystem after Phase 1. +- D-06: Single MinIO bucket named `docuvault`. Object keys follow `{user_id}/{document_id}/{uuid4()}{ext}` (STORE-02). Human-readable filenames stored in the `documents.filename` DB column only — never in the MinIO key. +- D-07: `backend/main.py` `/health` endpoint extended to check PostgreSQL + MinIO connectivity (not just `{"status": "ok"}`). Health checks gate `docker compose up` readiness. + +**Background Worker** +- D-08: Background task queue: Celery + Redis (STORE-08). FastAPI `BackgroundTasks` replaced. +- D-09: Redis service added to `docker-compose.yml` in Phase 1. Redis doubles as the rate-limiting store for Phase 2 auth endpoints — no second Redis needed later. +- D-10: A `celery-worker` service is added to `docker-compose.yml`. Celery broker and result backend both point to the same Redis instance via `REDIS_URL`. + +**Env / Secrets Strategy** +- D-11: `.env` gitignored + `.env.example` committed. `docker-compose.yml` reads vars via `${VAR_NAME}`. `.env.example` has safe placeholder values and comments explaining each variable. +- D-12: Production secrets stored outside the project directory at `/etc/docuvault/env` (`chmod 600`, owned by the service user, not root). `docker-compose.yml` references it via `env_file:`. Documented in deployment notes. +- D-13: Two PostgreSQL DSNs: `DATABASE_URL` (restricted app user `docuvault_app`, SELECT/INSERT/UPDATE/DELETE only; no DDL) and `DATABASE_MIGRATE_URL` (migration user `docuvault_migrate`, DDL privileges; used only by Alembic). +- D-14: PostgreSQL init script in `docker/postgres/initdb.d/` provisions both users on first container start. The app never connects as the PostgreSQL superuser. +- D-15: MinIO vars: `MINIO_ENDPOINT`, `MINIO_ROOT_USER`, `MINIO_ROOT_PASSWORD` (init only), `MINIO_BUCKET` (value: `docuvault`), `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY` (separate app-level access key pair with minimal bucket permissions). +- D-16: Additional vars in Phase 1 `.env.example`: `REDIS_URL`, `SECRET_KEY` (documented now for Phase 2 JWT + HKDF use; app does not read it in Phase 1). + +### Claude's Discretion + +None — user made explicit choices for all areas. + +### Deferred Ideas (OUT OF SCOPE) + +None — discussion stayed within phase scope. + + +--- + + +## Phase Requirements + +| ID | Description | Research Support | +|----|-------------|------------------| +| STORE-01 | Platform storage layer migrated from flat-file JSON + local filesystem to PostgreSQL (metadata) + MinIO (objects) | SQLAlchemy 2.0 async ORM + MinIO SDK patterns documented; service rewrite approach confirmed | +| STORE-02 | Each user's MinIO objects use `{user_id}/{document_id}/{uuid4()}{ext}` keys — human-readable filenames stored in DB only | MinIO `put_object()` API confirmed; key schema enforced in model/service layer | +| STORE-07 | Backend is stateless — no per-instance file locks; multiple instances can run behind a load balancer | PostgreSQL atomic UPDATE + Celery + Redis replaces filelock pattern; verified | + + +--- + +## Summary + +Phase 1 replaces the entire flat-file persistence layer (JSON metadata + local filesystem uploads) with PostgreSQL (via SQLAlchemy 2.0 async ORM) + MinIO (via the official Python SDK) wired into Docker Compose. Redis and a Celery worker are added alongside as the background task queue that replaces FastAPI `BackgroundTasks`, delivering statelessness required by STORE-07. All infrastructure services are health-checked and ordered via `depends_on` conditions so `docker compose up` can be treated as the single operational command. Alembic manages the schema using the async migration template with a two-DSN strategy (restricted app user + DDL migration user). The walking skeleton requirement is satisfied by: the full v1 schema applied via Alembic, one real document upload persisted to PostgreSQL and MinIO through the rewritten storage service, and the `/health` endpoint returning live connectivity checks for all three services. + +The existing single-user document upload → text extraction → AI classification workflow continues to work end-to-end after Phase 1. The Vue frontend requires no changes. All API routes and response shapes are preserved. + +**Primary recommendation:** Wire infrastructure with Docker Compose health checks first; apply Alembic migration second; rewrite `services/storage.py` third; replace `BackgroundTasks` with Celery tasks last. This ordering allows each layer to be verified before the next is built. + +--- + +## Architectural Responsibility Map + +| Capability | Primary Tier | Secondary Tier | Rationale | +|------------|-------------|----------------|-----------| +| Document metadata persistence | Database / Storage (PostgreSQL) | API / Backend | All metadata is authored and read server-side; no client involvement | +| Binary file storage | Database / Storage (MinIO) | API / Backend | Object store owns bytes; backend generates keys and proxies operations | +| Background text extraction + classification | Background Worker (Celery) | API / Backend | CPU-intensive, deferred; must not block HTTP event loop | +| Health checking | API / Backend | Docker Compose | FastAPI `/health` probes PostgreSQL + MinIO; Compose waits on it | +| Schema migrations | Database / Storage (Alembic + PostgreSQL) | — | DDL-only responsibility; executed before app starts | +| Object key namespacing | API / Backend (service layer) | — | Key construction is a code concern, not a storage concern | +| Service ordering / startup sequencing | CDN / Static (Docker Compose) | — | `depends_on: condition: service_healthy` enforces boot order | +| Connection pooling | API / Backend (SQLAlchemy pool) | Database / Storage | App holds pool; PostgreSQL is the pooled resource | +| Task queue / broker | Background Worker (Redis / Celery) | API / Backend | Broker is Redis; workers are separate Docker Compose services | + +--- + +## Standard Stack + +### Core + +| Library | Version | Purpose | Why Standard | +|---------|---------|---------|--------------| +| `sqlalchemy[asyncio]` | `>=2.0.49` | ORM + async engine + connection pool | Industry standard for Python async PostgreSQL; `create_async_engine` + `async_sessionmaker` pattern is the canonical FastAPI integration | +| `psycopg[binary]` | `>=3.3.4` | PostgreSQL async driver | psycopg v3 (`psycopg`) is SQLAlchemy 2.0's preferred async dialect; `[binary]` provides pre-built wheels with no system dependency on libpq headers | +| `alembic` | `>=1.18.4` | Database migrations | The only maintained migration tool for SQLAlchemy; provides async template (`alembic init -t async`) | +| `minio` | `>=7.2.20` | MinIO / S3 object storage SDK | Official MinIO Python SDK; stable API for `put_object`, `get_object`, `bucket_exists`, `presigned_get_object` | +| `celery[redis]` | `>=5.6.3` | Background task queue + Redis transport | Battle-tested distributed task queue; `[redis]` extra installs `redis` client; replaces per-instance `BackgroundTasks` | +| `redis` | `>=7.4.0` | Redis Python client (Celery dependency + Phase 2 rate limiting) | Official Redis client; installed transitively by `celery[redis]` but worth pinning for Phase 2 rate limiting use | + +### Supporting + +| Library | Version | Purpose | When to Use | +|---------|---------|---------|-------------| +| `pydantic-settings` | `>=2.2` | Env var configuration (already in project) | Extended with new DATABASE_URL, MINIO_*, REDIS_URL vars | +| `anyio` | `>=4.13.0` | Async testing utilities | Required by `httpx` for async test transport in pytest | +| `httpx` | `>=0.28.1` | Async HTTP client for integration tests | Needed to replace `TestClient` (sync) with `AsyncClient` for async route testing | +| `pytest-asyncio` | `>=1.3.0` | Async test runner integration | Already in project as `>=0.23`; upgrade to `>=1.3.0` for `asyncio_mode = auto` support in new async tests | + +### Alternatives Considered + +| Instead of | Could Use | Tradeoff | +|------------|-----------|----------| +| `psycopg[binary]` | `asyncpg` | `asyncpg` is faster in benchmarks but requires a separate sync driver (`psycopg2`) for Alembic. `psycopg` v3 works for both sync (Alembic) and async (FastAPI) with the same URL — zero driver switching | +| `celery[redis]` | `pgqueuer` / `pg_boss` | pgqueuer uses PostgreSQL as the queue (no Redis required). However, the user explicitly selected Celery + Redis. Redis is also needed in Phase 2 for rate limiting, so Redis is justified regardless | +| `minio` Python SDK (sync, wrapped in `asyncio.to_thread`) | `aiobotocore` | MinIO SDK is the official client with full API coverage including MinIO-specific features. `aiobotocore` is AWS-oriented and less tested with MinIO-specific APIs. `to_thread()` wrapping is the correct async pattern for the sync SDK | + +**Installation (backend/requirements.txt additions):** +``` +sqlalchemy[asyncio]>=2.0 +psycopg[binary]>=3.3 +alembic>=1.13 +minio>=7.2 +celery[redis]>=5.4 +redis>=7.0 +httpx>=0.27 +pytest-asyncio>=0.23 +``` + +Note: `psycopg[binary]` is specified with bracket extras in requirements.txt. The binary extra installs a self-contained wheel — no system `libpq-dev` package required in the Docker image, simplifying the Dockerfile. + +--- + +## Package Legitimacy Audit + +All packages verified on PyPI registry via `pip3 index versions` and `slopcheck install` (v0.6.1, run 2026-05-21). + +| Package | Registry | Age | Downloads | Source Repo | slopcheck | Disposition | +|---------|----------|-----|-----------|-------------|-----------|-------------| +| `sqlalchemy` | PyPI | ~20 yrs | Very high (millions/wk) | github.com/sqlalchemy/sqlalchemy | OK | Approved | +| `psycopg` | PyPI | ~4 yrs (v3) | High | github.com/psycopg/psycopg | OK | Approved | +| `alembic` | PyPI | ~12 yrs | Very high | github.com/sqlalchemy/alembic | OK | Approved | +| `minio` | PyPI | ~8 yrs | High | github.com/minio/minio-py | OK | Approved | +| `celery` | PyPI | ~15 yrs | Very high (millions/wk) | github.com/celery/celery | OK | Approved | +| `redis` | PyPI | ~12 yrs | Very high | github.com/redis/redis-py | OK | Approved | + +**Packages removed due to slopcheck [SLOP] verdict:** none +**Packages flagged as suspicious [SUS]:** none + +Note: `psycopg[binary]` is specified with extras syntax in requirements.txt; the installable wheel is `psycopg-binary` on PyPI, which also passed registry verification (version 3.3.4 confirmed). [VERIFIED: PyPI registry + slopcheck OK] + +--- + +## Architecture Patterns + +### System Architecture Diagram + +``` +Browser (Vue 3 SPA — unchanged in Phase 1) + │ HTTP/JSON + multipart (same API contract) + ▼ +FastAPI (port 8000) — lifespan creates async engine, disposes on shutdown + │ + ├── api/documents.py ─── calls ──► services/storage.py (REWRITTEN) + │ │ + │ ├─► db/session.py (AsyncSession) + │ │ │ + │ │ ▼ + │ │ PostgreSQL (port 5432) + │ │ [docuvault_app user, restricted] + │ │ + │ └─► storage/minio_backend.py + │ │ + │ ▼ + │ MinIO (port 9000) + │ [bucket: docuvault] + │ [app-level access key] + │ + ├── /health ─── probes ──► PostgreSQL + MinIO connectivity + │ + └── celery_app.py ─── enqueues tasks ──► Redis (port 6379) + │ + Celery Worker (separate container) + ├── task: extract_and_classify() + │ ├─► services/extractor.py + │ └─► services/classifier.py + └── consumes from Redis queue + +Alembic (run once at deploy time, not part of app startup) + │ uses DATABASE_MIGRATE_URL (docuvault_migrate user, DDL privileges) + └─► PostgreSQL — applies full v1 schema +``` + +### Recommended Project Structure + +``` +backend/ +├── main.py # FastAPI app; extend lifespan for engine/dispose +├── config.py # pydantic-settings: extend with new env vars +├── celery_app.py # Celery app instance (broker from REDIS_URL) +├── db/ +│ ├── __init__.py +│ ├── session.py # async engine + async_sessionmaker +│ └── models.py # all SQLAlchemy ORM models (full v1 schema) +├── deps/ +│ └── db.py # get_db() — yields AsyncSession +├── services/ +│ ├── storage.py # REPLACED: async SQLAlchemy + MinIO SDK +│ ├── extractor.py # unchanged +│ └── classifier.py # update to accept session; dispatch via Celery +├── storage/ # NEW: StorageBackend ABC + MinIO implementation +│ ├── __init__.py # get_storage_backend() factory +│ ├── base.py # StorageBackend ABC (mirrors ai/base.py) +│ └── minio_backend.py # MinIO implementation +├── tasks/ +│ └── document_tasks.py # Celery task definitions (extract_and_classify) +├── migrations/ # Alembic migration directory +│ ├── env.py # async env.py with two-DSN strategy +│ ├── script.py.mako +│ └── versions/ +│ └── 0001_initial_schema.py +├── alembic.ini # sqlalchemy.url = DATABASE_MIGRATE_URL +├── api/ +│ ├── documents.py # update to use async storage service +│ ├── topics.py # unchanged (topics still in DB after migration) +│ └── settings.py # unchanged +└── tests/ + ├── conftest.py # UPDATE: add async engine + session fixtures + ├── test_health.py # UPDATE: test PostgreSQL + MinIO health probes + ├── test_documents.py # UPDATE: adapt for async storage layer + └── test_storage.py # NEW: unit tests for MinIO object key schema +``` + +### Pattern 1: SQLAlchemy 2.0 Async Engine + Session Factory (FastAPI Lifespan) + +**What:** Create engine once at startup, share it application-wide via `app.state`. Session factory (`async_sessionmaker`) yields per-request sessions via a FastAPI dependency. + +**When to use:** Any database access in FastAPI route handlers or services. + +**Example:** +```python +# db/session.py +from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession +from config import settings + +engine = create_async_engine( + settings.database_url, # postgresql+psycopg://docuvault_app:...@postgres/docuvault + pool_pre_ping=True, # detect stale connections before use + echo=False, +) + +AsyncSessionLocal = async_sessionmaker( + engine, + class_=AsyncSession, + expire_on_commit=False, # prevent lazy-load errors after commit +) + +# deps/db.py +from db.session import AsyncSessionLocal + +async def get_db(): + async with AsyncSessionLocal() as session: + try: + yield session + finally: + await session.close() + +# main.py — lifespan +from contextlib import asynccontextmanager +from db.session import engine + +@asynccontextmanager +async def lifespan(app: FastAPI): + # Startup: engine creates pool on first connection + yield + # Shutdown: close all pooled connections + await engine.dispose() + +app = FastAPI(lifespan=lifespan) +``` + +**Source:** [CITED: docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html] + +**Key detail — URL format for psycopg v3:** +``` +postgresql+psycopg://user:password@host:port/dbname +``` +The same `postgresql+psycopg://` prefix works for both `create_engine()` (Alembic) and `create_async_engine()` (FastAPI). SQLAlchemy selects the sync or async dialect variant automatically. [CITED: docs.sqlalchemy.org/en/20/dialects/postgresql.html] + +**Key detail — `expire_on_commit=False`:** After `session.commit()`, SQLAlchemy marks all objects as expired and would trigger another SELECT on next attribute access. In async context, this causes `MissingGreenlet` errors because there's no active async context at that point. Setting `expire_on_commit=False` prevents this. [CITED: docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html] + +--- + +### Pattern 2: Alembic Async Configuration with Two DSNs + +**What:** Alembic's async template (`alembic init -t async`) generates `env.py` that uses `async_engine_from_config` and `asyncio.run()`. The `DATABASE_MIGRATE_URL` DSN (DDL privileges) is used only by Alembic; the app uses `DATABASE_URL` (restricted). This separates migration risk from runtime risk. + +**When to use:** Every `alembic upgrade head` call. Never used by FastAPI directly. + +**Example:** +```python +# migrations/env.py (key section — async online migrations) +import asyncio +from sqlalchemy.ext.asyncio import async_engine_from_config +from sqlalchemy import pool +from alembic import context +from db.models import Base # import all models so metadata is populated + +target_metadata = Base.metadata + +def do_run_migrations(connection): + context.configure(connection=connection, target_metadata=target_metadata) + with context.begin_transaction(): + context.run_migrations() + +async def run_async_migrations(): + connectable = async_engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, # migrations use per-run connection, not pool + ) + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + await connectable.dispose() + +def run_migrations_online(): + asyncio.run(run_async_migrations()) +``` + +```ini +# alembic.ini +[alembic] +script_location = migrations +sqlalchemy.url = %(DATABASE_MIGRATE_URL)s # reads from env via %(VAR)s interpolation +``` + +**Two-DSN in practice:** The `alembic.ini` `sqlalchemy.url` references `DATABASE_MIGRATE_URL`. FastAPI's `db/session.py` reads `DATABASE_URL`. Both are set in `.env`. The Docker Compose `backend` service has both env vars; the `celery-worker` service has `DATABASE_URL` only (workers need no DDL). + +**Source:** [CITED: alembic.sqlalchemy.org/en/latest/cookbook.html#using-asyncio-with-alembic] + [CITED: github.com/sqlalchemy/alembic/blob/main/alembic/templates/async/env.py] + +--- + +### Pattern 3: MinIO SDK Sync-in-Async via `asyncio.to_thread()` + +**What:** The MinIO Python SDK is synchronous. In an async FastAPI context, blocking I/O blocks the event loop. Wrap MinIO SDK calls in `asyncio.to_thread()` to offload to a thread pool without blocking. + +**When to use:** All MinIO operations (`put_object`, `get_object`, `bucket_exists`, `presigned_get_object`) called from `async def` handlers or services. + +**Example:** +```python +# storage/minio_backend.py +import asyncio +import io +import uuid +from datetime import timedelta +from minio import Minio +from storage.base import StorageBackend + +class MinIOBackend(StorageBackend): + def __init__(self, endpoint: str, access_key: str, secret_key: str, + bucket: str, secure: bool = False): + self._client = Minio( + endpoint=endpoint, + access_key=access_key, + secret_key=secret_key, + secure=secure, # False for Docker internal network (HTTP) + ) + self._bucket = bucket + + async def put_object( + self, + user_id: str, + document_id: str, + file_bytes: bytes, + extension: str, + content_type: str, + ) -> str: + object_key = f"{user_id}/{document_id}/{uuid.uuid4()}{extension}" + data = io.BytesIO(file_bytes) + await asyncio.to_thread( + self._client.put_object, + self._bucket, + object_key, + data, + length=len(file_bytes), + content_type=content_type, + ) + return object_key + + async def presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str: + return await asyncio.to_thread( + self._client.presigned_get_object, + bucket_name=self._bucket, + object_name=object_key, + expires=timedelta(minutes=expires_minutes), + ) + + async def health_check(self) -> bool: + try: + return await asyncio.to_thread( + self._client.bucket_exists, self._bucket + ) + except Exception: + return False +``` + +**MinIO `put_object` signature (confirmed):** +```python +client.put_object( + bucket_name: str, + object_name: str, # the object key + data: io.RawIOBase, # io.BytesIO is accepted + length: int, # -1 with part_size for unknown-length streams + content_type: str = "application/octet-stream", +) +``` + +**Note on `length=-1`:** For unknown-length streams, set `length=-1` and `part_size=10*1024*1024`. For in-memory `io.BytesIO`, always pass `length=len(bytes)` — this avoids a multipart upload when not needed. + +**Source:** [CITED: github.com/minio/minio-py/blob/master/docs/API.md] + +--- + +### Pattern 4: MinIO Bucket Initialization at Startup + +**What:** On first `docker compose up`, MinIO starts with an empty state. The application must create the `docuvault` bucket if it doesn't exist. This is done in the FastAPI lifespan, not in user request handlers. + +**Example:** +```python +# main.py lifespan extension +@asynccontextmanager +async def lifespan(app: FastAPI): + # PostgreSQL engine + pool + # MinIO bucket initialization + minio_client = Minio( + settings.minio_endpoint, + access_key=settings.minio_access_key, + secret_key=settings.minio_secret_key, + secure=False, + ) + exists = await asyncio.to_thread(minio_client.bucket_exists, settings.minio_bucket) + if not exists: + await asyncio.to_thread(minio_client.make_bucket, settings.minio_bucket) + app.state.minio = minio_client + yield + await engine.dispose() +``` + +--- + +### Pattern 5: Celery App + Redis Broker Configuration + +**What:** A single `celery_app.py` module defines the Celery application. Tasks are defined as decorated functions. FastAPI route handlers call `.delay()` to enqueue; the celery-worker container processes them. + +**Redis URL format (with password, Docker internal network):** +``` +redis://:${REDIS_PASSWORD}@redis:6379/0 +``` +The `:` before the password with no username is the correct format when Redis is configured with `requirepass` but no ACL users. [CITED: docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html via WebSearch] + +**Example:** +```python +# celery_app.py +import os +from celery import Celery + +celery_app = Celery("docuvault") +celery_app.conf.broker_url = os.environ.get("REDIS_URL", "redis://redis:6379/0") +celery_app.conf.result_backend = os.environ.get("REDIS_URL", "redis://redis:6379/0") +celery_app.conf.task_serializer = "json" +celery_app.conf.result_serializer = "json" +celery_app.conf.accept_content = ["json"] +celery_app.conf.task_routes = { + "tasks.document_tasks.*": {"queue": "documents"}, +} + +# tasks/document_tasks.py +from celery_app import celery_app + +@celery_app.task(name="tasks.document_tasks.extract_and_classify") +def extract_and_classify(document_id: str) -> dict: + # Celery tasks are SYNCHRONOUS functions — do NOT use async def here. + # Use asyncio.run() sparingly or run sync equivalents of extractor/classifier. + from services import extractor, classifier + ... + +# api/documents.py — calling the task +from tasks.document_tasks import extract_and_classify + +@router.post("/upload") +async def upload_document(...): + ... + # Replace: background_tasks.add_task(classifier.classify_document, doc_id) + # With: + extract_and_classify.delay(str(saved_doc.id)) + return meta +``` + +**Critical: Celery tasks are synchronous.** The Celery worker runs a standard Python event loop (not asyncio). Calling `async def` functions inside a Celery task requires `asyncio.run()`, which creates a new event loop per task invocation. This is acceptable for Phase 1 since the existing `extractor.py` and `classifier.py` services already have sync and async entry points, but keep tasks pure-sync where possible. [VERIFIED via WebSearch cross-checked with official docs] + +**Worker startup command:** +``` +celery -A celery_app worker --loglevel=info -Q documents +``` + +--- + +### Pattern 6: Docker Compose Health Checks + `depends_on` + +**What:** Each infrastructure service has a `healthcheck` definition. The `backend` service uses `depends_on: condition: service_healthy` to wait for all three (postgres, minio, redis) before starting. + +**Example:** +```yaml +services: + postgres: + image: postgres:17-alpine + environment: + POSTGRES_DB: docuvault + POSTGRES_USER: postgres + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + volumes: + - postgres_data:/var/lib/postgresql/data + - ./docker/postgres/initdb.d:/docker-entrypoint-initdb.d:ro + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d docuvault"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + minio: + image: minio/minio:latest + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: ${MINIO_ROOT_USER} + MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD} + ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio_data:/data + healthcheck: + # curl is removed from recent MinIO images; use the /minio/health/live HTTP endpoint + # from the host. Inside the container, mc is available: + test: ["CMD", "mc", "ready", "local"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + + redis: + image: redis:7-alpine + command: redis-server --requirepass ${REDIS_PASSWORD} + healthcheck: + test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"] + interval: 10s + timeout: 3s + retries: 5 + + backend: + depends_on: + postgres: + condition: service_healthy + minio: + condition: service_healthy + redis: + condition: service_healthy +``` + +**MinIO healthcheck note:** `curl` was removed from MinIO's Docker image in October 2023. The `mc ready local` command is the current recommended healthcheck inside the container. The `/minio/health/live` HTTP endpoint (returns 200 OK) is still valid for external probing but cannot be used inside the container without curl. [CITED: github.com/minio/minio/issues/18389] + +--- + +### Pattern 7: PostgreSQL Two-User Init Script + +**What:** The official PostgreSQL Docker image runs scripts in `/docker-entrypoint-initdb.d/` on first start (empty volume). A SQL script provisions two users: `docuvault_migrate` (DDL) and `docuvault_app` (runtime, restricted). + +**When to use:** First `docker compose up` with a fresh volume. Idempotent for re-runs is not required — init scripts only run once. + +**Example:** +```sql +-- docker/postgres/initdb.d/01-init-users.sql +-- Runs as the POSTGRES_USER superuser on first container start only. + +-- Migration user: DDL privileges (CREATE TABLE, ALTER TABLE, CREATE INDEX) +CREATE USER docuvault_migrate WITH PASSWORD 'PLACEHOLDER_MIGRATE_PASSWORD'; +GRANT ALL PRIVILEGES ON DATABASE docuvault TO docuvault_migrate; + +-- App user: runtime DML only (SELECT, INSERT, UPDATE, DELETE) +CREATE USER docuvault_app WITH PASSWORD 'PLACEHOLDER_APP_PASSWORD'; +GRANT CONNECT ON DATABASE docuvault TO docuvault_app; + +-- Grant schema-level privileges AFTER migration user creates the schema +-- This must run after alembic upgrade head, OR grant in a second script. +-- Pattern: grant via a post-migration step or grant within the migration itself: +-- GRANT USAGE ON SCHEMA public TO docuvault_app; +-- GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO docuvault_app; +-- ALTER DEFAULT PRIVILEGES IN SCHEMA public +-- GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO docuvault_app; +``` + +**Important:** The `GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES` must be run AFTER Alembic has created the tables, because `ON ALL TABLES` applies only to existing tables. Use `ALTER DEFAULT PRIVILEGES` so future tables (from future migrations) are also accessible. This can be done at the end of the first Alembic migration file, or in a post-migration Docker entrypoint hook. + +**Recommended approach for Phase 1:** Run the GRANT as the last step of the `0001_initial_schema.py` migration using `op.execute()` as the `docuvault_migrate` user (which has full privileges). [ASSUMED — no official doc confirming this is the standard Alembic pattern, but it follows from standard PostgreSQL privilege management] + +--- + +### Pattern 8: StorageBackend ABC (Mirrors `ai/` Pattern) + +**What:** `storage/base.py` defines `StorageBackend` as an abstract base class with the same structure as `ai/base.py`. `storage/__init__.py` provides a `get_storage_backend()` factory. `storage/minio_backend.py` is the Phase 1 implementation. + +**Example:** +```python +# storage/base.py +from abc import ABC, abstractmethod + +class StorageBackend(ABC): + @abstractmethod + async def put_object( + self, user_id: str, document_id: str, + file_bytes: bytes, extension: str, content_type: str, + ) -> str: + """Store object; return the object_key used.""" + + @abstractmethod + async def get_object(self, object_key: str) -> bytes: + """Retrieve object bytes by key.""" + + @abstractmethod + async def delete_object(self, object_key: str) -> None: + """Delete object by key.""" + + @abstractmethod + async def presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str: + """Return a time-limited download URL.""" + + @abstractmethod + async def health_check(self) -> bool: + """Return True if backend is reachable.""" + +# storage/__init__.py +from config import settings +from storage.minio_backend import MinIOBackend + +def get_storage_backend() -> StorageBackend: + return MinIOBackend( + endpoint=settings.minio_endpoint, + access_key=settings.minio_access_key, + secret_key=settings.minio_secret_key, + bucket=settings.minio_bucket, + secure=False, + ) +``` + +--- + +### Anti-Patterns to Avoid + +- **Sync SQLAlchemy in async context:** Using `create_engine()` instead of `create_async_engine()` in FastAPI will block the event loop on every database call. Use `create_async_engine` throughout. +- **Calling `await session.commit()` then accessing lazy-loaded attributes:** Always set `expire_on_commit=False` or explicitly refresh after commit. +- **Connecting Alembic using `DATABASE_URL` (restricted user):** The restricted `docuvault_app` user has no DDL privileges. Alembic migrations will fail with `permission denied` errors. Alembic must always use `DATABASE_MIGRATE_URL`. +- **Using `async def` for Celery task functions:** Celery workers do not run an asyncio event loop. Define tasks as `def`, not `async def`. Wrap any async calls with `asyncio.run()` if unavoidable, but prefer sync implementations in tasks. +- **Storing human-readable filename as MinIO object key:** Object keys must be UUID-based (`{user_id}/{document_id}/{uuid4()}{ext}`). Filenames are stored ONLY in the `documents.filename` DB column. Putting human filenames in the key enables path traversal and makes key prediction trivial. +- **Using `minio_client.bucket_exists()` inside async handlers without `asyncio.to_thread`:** The MinIO SDK is synchronous; calling it directly from `async def` will block the event loop. +- **MinIO `mc ready local` healthcheck with a password-protected Redis `redis-cli ping`:** For Redis with `requirepass`, the healthcheck must pass `-a $REDIS_PASSWORD` to `redis-cli`. A bare `redis-cli ping` will return `NOAUTH` and be treated as unhealthy. + +--- + +## Don't Hand-Roll + +| Problem | Don't Build | Use Instead | Why | +|---------|-------------|-------------|-----| +| Async PostgreSQL session management | Custom connection/context manager | SQLAlchemy `async_sessionmaker` + `Depends(get_db)` | Handles connection pooling, transaction boundaries, error cleanup, and the `expire_on_commit` edge case | +| Database schema migrations | Manual `CREATE TABLE` scripts in Python | Alembic | Manages migration history, rollbacks, auto-generation from ORM models, and multi-environment DSN configuration | +| MinIO object lifecycle | Custom S3-like HTTP client | `minio` Python SDK | Handles multipart uploads, signature v4, presigned URL expiry, retry logic, and connection pooling | +| Background task distribution | Thread pools or `asyncio.create_task()` | Celery + Redis | Cross-instance task distribution, retry on failure, dead letter queues, task result storage | +| Docker service ordering | `sleep` commands in Compose entrypoints | `healthcheck` + `depends_on: condition: service_healthy` | Deterministic, declarative; `sleep` is a race condition | +| PostgreSQL privilege management | Per-table GRANT scripts written by hand | `ALTER DEFAULT PRIVILEGES` in Alembic migration | Future migrations automatically inherit privileges; hand-written grants go stale | + +**Key insight:** The existing `filelock`-based `services/storage.py` uses at least 6 custom concurrency primitives to solve problems that PostgreSQL's transaction isolation and MinIO's atomic object operations solve at the infrastructure level. The rewrite simplifies the code while gaining correctness guarantees. + +--- + +## Common Pitfalls + +### Pitfall 1: `expire_on_commit=True` (the default) Causes `MissingGreenlet` + +**What goes wrong:** After `await session.commit()`, accessing any ORM object attribute triggers a new SELECT query. In async context, if there is no active session scope, SQLAlchemy raises `sqlalchemy.exc.MissingGreenlet: greenlet_spawn has not been called`. + +**Why it happens:** The default `Session.expire_on_commit=True` marks objects as "expired" post-commit. The next attribute access triggers a lazy load, which needs a sync greenlet context (not available in asyncio). + +**How to avoid:** Always set `expire_on_commit=False` in `async_sessionmaker`. [CITED: docs.sqlalchemy.org] + +**Warning signs:** `MissingGreenlet` in tracebacks after commit; attribute access on model instances outside `async with session` blocks. + +--- + +### Pitfall 2: Alembic `env.py` Not Importing All Models + +**What goes wrong:** `alembic revision --autogenerate` generates an empty migration even though models were defined. + +**Why it happens:** Alembic's `target_metadata` must be set to `Base.metadata`, and all model modules must be imported BEFORE `target_metadata` is accessed in `env.py`. Python only knows about models that have been imported. + +**How to avoid:** In `migrations/env.py`, explicitly import all model modules: +```python +from db import models # noqa: F401 — must import to register with Base.metadata +target_metadata = models.Base.metadata +``` + +**Warning signs:** Empty `op.` blocks in generated migrations; tables not appearing in migration history. + +--- + +### Pitfall 3: MinIO `put_object` Requires `io.BytesIO.seek(0)` Before Use + +**What goes wrong:** `put_object` reads 0 bytes if the `io.BytesIO` object's file pointer is at the end (e.g., after writing to it). + +**Why it happens:** `io.BytesIO.write()` advances the pointer to the end of the data. `put_object` starts reading from the current position. + +**How to avoid:** Always call `data.seek(0)` before passing a `BytesIO` to `put_object`. Or construct the `BytesIO` from the complete bytes directly: `io.BytesIO(file_bytes)` starts the pointer at 0. + +**Warning signs:** MinIO reports successful upload but object is 0 bytes; or `OSError: stream having not enough data`. + +--- + +### Pitfall 4: PostgreSQL Init Script GRANT Timing + +**What goes wrong:** `docuvault_app` user gets `permission denied` on tables even after `GRANT ... ON ALL TABLES`. + +**Why it happens:** `GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public` only applies to tables that exist at the time of the GRANT. Tables created by Alembic after the init script runs are not covered. + +**How to avoid:** Run `ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO docuvault_app;` in the Alembic initial migration (as `docuvault_migrate` user, which owns the tables). This covers all future tables created by the same migration user. + +**Warning signs:** First `docker compose up` works; second run after `alembic upgrade head` fails with 403 DB errors. + +--- + +### Pitfall 5: Redis Healthcheck Without Authentication + +**What goes wrong:** `redis-cli ping` returns `NOAUTH Authentication required` when Redis is started with `requirepass`. Docker Compose treats non-zero exit as unhealthy. Backend never starts. + +**Why it happens:** `redis-cli ping` without `-a` doesn't pass the password. + +**How to avoid:** Use `redis-cli -a ${REDIS_PASSWORD} ping` in the healthcheck `test` field. Note that this logs a warning about passing password on command line — acceptable for a healthcheck, not for production scripts. + +**Warning signs:** `backend` service stuck at `Waiting for redis to be healthy`; `redis-cli ping` showing `NOAUTH` in container logs. + +--- + +### Pitfall 6: MinIO `mc ready local` Healthcheck Not Available Without `mc` + +**What goes wrong:** `mc` is present in the official `minio/minio` Docker image, so `mc ready local` works as a healthcheck. If using a third-party or stripped MinIO image, `mc` may be absent. + +**How to avoid:** Stick to the official `minio/minio:latest` image. If a custom image is needed, use the `/minio/health/live` HTTP endpoint probed from a sidecar or from the host — not from inside the container without curl. + +--- + +### Pitfall 7: Celery Worker Cannot Import FastAPI App Module + +**What goes wrong:** Celery worker Docker container imports `celery_app.py`, which transitively imports the FastAPI app or lifespan, which tries to open database connections or access `app.state`. + +**Why it happens:** Shared imports between the FastAPI app and Celery tasks create circular dependencies at module load time. + +**How to avoid:** Keep `celery_app.py` minimal (Celery configuration only). Task functions in `tasks/` import services directly, not via `main.py` or any router. The Celery worker starts with `celery -A celery_app worker` — it never starts FastAPI. + +--- + +## Code Examples + +### Full v1 SQLAlchemy ORM Schema (Phase 1 Migration Target) + +```python +# db/models.py +import uuid +from datetime import datetime, timezone +from sqlalchemy import ( + Boolean, BigInteger, ForeignKey, Index, String, Text, + TIMESTAMP, UniqueConstraint, Integer +) +from sqlalchemy.dialects.postgresql import UUID, INET, JSONB +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship +from sqlalchemy.sql import func + +def now_utc(): + return datetime.now(timezone.utc) + +class Base(DeclarativeBase): + pass + +class User(Base): + __tablename__ = "users" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + handle: Mapped[str] = mapped_column(String, unique=True, nullable=False) + email: Mapped[str] = mapped_column(String, unique=True, nullable=False) + password_hash: Mapped[str] = mapped_column(Text, nullable=False) + totp_secret: Mapped[str | None] = mapped_column(Text, nullable=True) + totp_enabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) + role: Mapped[str] = mapped_column(String, nullable=False, default="user") + is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True) + ai_provider: Mapped[str | None] = mapped_column(Text, nullable=True) + ai_model: Mapped[str | None] = mapped_column(Text, nullable=True) + default_storage_backend: Mapped[str] = mapped_column(String, nullable=False, default="minio") + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + +class Quota(Base): + __tablename__ = "quotas" + user_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), primary_key=True) + limit_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False, default=104857600) # 100 MB + used_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0) + +class RefreshToken(Base): + __tablename__ = "refresh_tokens" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + user_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + token_hash: Mapped[str] = mapped_column(Text, unique=True, nullable=False) + expires_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False) + revoked: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + __table_args__ = (Index("ix_refresh_tokens_user_revoked", "user_id", "revoked"),) + +class Folder(Base): + __tablename__ = "folders" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + user_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + parent_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("folders.id", ondelete="CASCADE"), nullable=True) + name: Mapped[str] = mapped_column(Text, nullable=False) + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + __table_args__ = (UniqueConstraint("user_id", "parent_id", "name"),) + +class Document(Base): + __tablename__ = "documents" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + # user_id is NULLABLE in Phase 1 (D-03); Phase 2 migration adds NOT NULL + user_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=True) + folder_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("folders.id", ondelete="SET NULL"), nullable=True) + filename: Mapped[str] = mapped_column(Text, nullable=False) # original human-readable name + object_key: Mapped[str] = mapped_column(Text, nullable=False) # MinIO key: {user_id}/{doc_id}/{uuid4}{ext} + content_type: Mapped[str] = mapped_column(Text, nullable=False) + size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0) + storage_backend: Mapped[str] = mapped_column(String, nullable=False, default="minio") + extracted_text: Mapped[str | None] = mapped_column(Text, nullable=True) + status: Mapped[str] = mapped_column(String, nullable=False, default="pending") + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + updated_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + __table_args__ = ( + Index("ix_documents_user_folder", "user_id", "folder_id"), + Index("ix_documents_user_created", "user_id", "created_at"), + ) + +class Topic(Base): + __tablename__ = "topics" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + user_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=True) + name: Mapped[str] = mapped_column(Text, nullable=False) + description: Mapped[str] = mapped_column(Text, nullable=False, default="") + color: Mapped[str] = mapped_column(String(7), nullable=False, default="#6366f1") + __table_args__ = (UniqueConstraint("user_id", "name"),) + +class DocumentTopic(Base): + __tablename__ = "document_topics" + document_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), primary_key=True) + topic_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("topics.id", ondelete="CASCADE"), primary_key=True) + +class Share(Base): + __tablename__ = "shares" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("documents.id", ondelete="CASCADE"), nullable=False) + owner_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + recipient_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + permission: Mapped[str] = mapped_column(String, nullable=False, default="view") + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + __table_args__ = ( + UniqueConstraint("document_id", "recipient_id"), + Index("ix_shares_recipient", "recipient_id"), + ) + +class AuditLog(Base): + __tablename__ = "audit_log" + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + user_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="SET NULL"), nullable=True) + actor_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="SET NULL"), nullable=True) + event_type: Mapped[str] = mapped_column(Text, nullable=False) + resource_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True) + ip_address: Mapped[str | None] = mapped_column(INET, nullable=True) + metadata: Mapped[dict | None] = mapped_column(JSONB, nullable=True) + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + __table_args__ = ( + Index("ix_audit_user_created", "user_id", "created_at"), + Index("ix_audit_event_created", "event_type", "created_at"), + ) + +class CloudConnection(Base): + __tablename__ = "cloud_connections" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + user_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + provider: Mapped[str] = mapped_column(String, nullable=False) + display_name: Mapped[str] = mapped_column(Text, nullable=False) + credentials_enc: Mapped[str] = mapped_column(Text, nullable=False) + status: Mapped[str] = mapped_column(String, nullable=False, default="ACTIVE") + connected_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) + __table_args__ = (Index("ix_cloud_connections_user", "user_id"),) + +class Group(Base): + """v2 stub — empty table, seeded for schema completeness (PROJECT.md).""" + __tablename__ = "groups" + id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + name: Mapped[str] = mapped_column(Text, unique=True, nullable=False) + created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now()) +``` + +--- + +### Config Extension for New Env Vars + +```python +# config.py (extended) +from pydantic_settings import BaseSettings + +class Settings(BaseSettings): + # Existing + data_dir: str = "/app/data" + + # Phase 1 additions + database_url: str = "postgresql+psycopg://docuvault_app:changeme@postgres/docuvault" + database_migrate_url: str = "postgresql+psycopg://docuvault_migrate:changeme@postgres/docuvault" + minio_endpoint: str = "minio:9000" + minio_access_key: str = "docuvault_app" + minio_secret_key: str = "changeme" + minio_bucket: str = "docuvault" + redis_url: str = "redis://:changeme@redis:6379/0" + secret_key: str = "CHANGEME" # documented for Phase 2; not read in Phase 1 + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + +settings = Settings() +``` + +--- + +## State of the Art + +| Old Approach | Current Approach | When Changed | Impact | +|--------------|------------------|--------------|--------| +| `asyncpg` as the only async PostgreSQL dialect | `psycopg` v3 supports both sync + async via one package | 2022 (psycopg v3 release) | Single driver for Alembic + FastAPI; no separate sync/async packages | +| `alembic init` (sync template) | `alembic init -t async` for async engine migrations | Alembic 1.7+ | env.py template pre-configured for asyncio; no manual async wiring | +| `async_sessionmaker` equivalent was `sessionmaker` with separate import | `async_sessionmaker` is a first-class API in SQLAlchemy 2.0 | SQLAlchemy 2.0 (2023) | Cleaner factory pattern without subclassing | +| MinIO Docker image included `curl` for healthchecks | `curl` removed from image; `mc ready local` is the new healthcheck | October 2023 | Existing tutorials with `curl -f` healthcheck will silently fail on current images | +| `FastAPI BackgroundTasks` for async post-request work | Celery + Redis for distributed, reliable task queues | Ongoing | `BackgroundTasks` is per-instance and has no retry; Celery is cross-instance | + +**Deprecated/outdated:** +- `filelock` dependency: can be removed from `backend/requirements.txt` once `services/storage.py` is replaced (CONCERNS.md item 14 identifies the unused `shutil` import; same cleanup applies to `filelock`). +- Per-document `.lock` files in `data/metadata/`: deleted with `data/` directory contents (D-04). +- `psycopg2` (old driver): not installed and not needed; `psycopg` v3 is the replacement. +- Sync file I/O in async handlers (CONCERNS.md item 6): resolved entirely by switching to async SQLAlchemy. + +--- + +## Assumptions Log + +| # | Claim | Section | Risk if Wrong | +|---|-------|---------|---------------| +| A1 | Running `GRANT ... ON ALL TABLES` inside the Alembic initial migration as `docuvault_migrate` is the standard pattern for privilege handoff to `docuvault_app` | Pattern 7 (PostgreSQL init script) | If the migration user lacks permission to GRANT to another user, privileges must be set manually or via a separate script — delays testing | +| A2 | The Celery worker container can import `db/models.py` and `services/` directly without starting FastAPI (no circular import) | Pattern 5 (Celery) | If service modules import FastAPI components at module level, a refactor is needed before worker tasks can import services | +| A3 | `minio/minio:latest` Docker image includes `mc` for the `mc ready local` healthcheck | Pattern 6 (Docker Compose) | If `mc` is not in the image, healthcheck must use a shell-based TCP probe or alternative; confirmed via GitHub issue discussion [CITED: github.com/minio/minio/issues/18389] but version-specific | + +--- + +## Open Questions + +1. **PostgreSQL version to pin in Docker Compose** + - What we know: Any PostgreSQL 14+ supports `gen_random_uuid()`, `JSONB`, `INET`, and `TIMESTAMPTZ` used in the schema. + - What's unclear: Whether to use `postgres:16`, `postgres:17`, or `postgres:17-alpine`. + - Recommendation: Use `postgres:17-alpine` (smallest image, current stable, alpine is well-suited for Docker Compose dev setups). + +2. **MinIO version pinning** + - What we know: `minio/minio:latest` has `mc` available for healthchecks; `curl` was removed in late 2023. + - What's unclear: Whether to pin to a specific release tag (e.g., `RELEASE.2025-09-07T16-13-09Z`) or use `:latest`. + - Recommendation: Pin to a specific RELEASE tag for reproducibility; update as part of a maintenance task. [ASSUMED — no strong official guidance on whether `:latest` is appropriate for production-adjacent Docker Compose] + +3. **Topics table migration: existing topic names from `data/topics.json`** + - What we know: D-04 deletes `data/` contents. Topics stored in `topics.json` are test data and are deleted. + - What's unclear: The existing `api/topics.py` and `frontend/src/stores/topics.js` need updating to read from PostgreSQL instead of the flat file. The API shape should remain the same (list of objects with `id`, `name`, `description`, `color`). + - Recommendation: The planner must include a task for updating `api/topics.py` to use async SQLAlchemy ORM queries against the `topics` table. + +4. **Celery task vs direct service call for text extraction + classification** + - What we know: The current `api/documents.py` calls `await classifier.classify_document()` inside the route handler. This needs to move to a Celery task. + - What's unclear: Whether Phase 1 should move ALL of extraction + classification into a Celery task (full async flow) or just wire up the infrastructure with a placeholder task and migrate the logic in Phase 3. + - Recommendation: Phase 1 should wire the full task (extract + classify) in Celery — the walking skeleton requirement says "AI classification workflow completes successfully." A placeholder task that doesn't classify would fail the success criteria. + +--- + +## Environment Availability + +| Dependency | Required By | Available | Version | Fallback | +|------------|------------|-----------|---------|----------| +| Docker | Docker Compose services | ✓ | 29.5.0 | — | +| Python 3.12 | Backend (in Docker image) | ✓ (host: 3.14.5; Docker: 3.12 pinned) | 3.12 in image | — | +| PostgreSQL (via Docker) | Database tier | ✓ (via Docker) | 17 (image) | — | +| MinIO (via Docker) | Object storage | ✓ (via Docker) | latest | — | +| Redis (via Docker) | Celery broker, Phase 2 rate limiting | ✓ (via Docker) | 7-alpine | — | +| pytest | Backend test runner | ✓ (host pip3) | existing | — | + +**Missing dependencies with no fallback:** None. +**Missing dependencies with fallback:** None. + +--- + +## Validation Architecture + +### Test Framework + +| Property | Value | +|----------|-------| +| Framework | pytest with pytest-asyncio (existing) | +| Config file | `backend/pytest.ini` (existing; `asyncio_mode = auto`) | +| Quick run command | `cd backend && pytest tests/test_health.py tests/test_documents.py tests/test_storage.py -x` | +| Full suite command | `cd backend && pytest -v` | + +### Phase Requirements → Test Map + +| Req ID | Behavior | Test Type | Automated Command | File Exists? | +|--------|----------|-----------|-------------------|-------------| +| STORE-01 | Upload stores metadata in PostgreSQL and bytes in MinIO | integration | `pytest tests/test_documents.py::test_upload_stores_to_postgres_and_minio -x` | ❌ Wave 0 | +| STORE-01 | List documents reads from PostgreSQL (not filesystem) | integration | `pytest tests/test_documents.py::test_list_reads_from_db -x` | ❌ Wave 0 | +| STORE-02 | MinIO object key matches `{user_id}/{document_id}/{uuid4}{ext}` pattern | unit | `pytest tests/test_storage.py::test_object_key_schema -x` | ❌ Wave 0 | +| STORE-02 | Human-readable filename is NOT in the object key | unit | `pytest tests/test_storage.py::test_filename_not_in_object_key -x` | ❌ Wave 0 | +| STORE-07 | `/health` returns PostgreSQL + MinIO connectivity (not just `{"status": "ok"}`) | smoke | `pytest tests/test_health.py::test_health_checks_postgres_and_minio -x` | ❌ Wave 0 | +| STORE-07 (implicit) | Storage service has no file locks; concurrent uploads do not corrupt state | integration | `pytest tests/test_documents.py::test_concurrent_uploads -x` | ❌ Wave 0 | + +### Sampling Rate + +- **Per task commit:** `cd backend && pytest tests/test_health.py tests/test_storage.py -x` +- **Per wave merge:** `cd backend && pytest -v` +- **Phase gate:** Full suite green before `/gsd:verify-work` + +### Wave 0 Gaps + +- [ ] `tests/test_storage.py` — covers STORE-02 (object key schema, filename isolation) +- [ ] `tests/test_documents.py` — extend for PostgreSQL/MinIO-backed upload/list (STORE-01) +- [ ] `tests/test_health.py` — extend for PostgreSQL + MinIO connectivity probes (STORE-07) +- [ ] `tests/conftest.py` — add async engine + session fixtures; add MinIO mock or test bucket fixture +- [ ] Update `tests/conftest.py` to monkeypatch `db/session.py` paths (not just `config.py` paths) + +**Existing tests:** `test_documents.py`, `test_topics.py`, `test_settings.py` test the OLD flat-file storage layer. They will break after `services/storage.py` is replaced. These must be ported (not deleted) as part of Phase 1. + +--- + +## Security Domain + +### Applicable ASVS Categories + +| ASVS Category | Applies | Standard Control | +|---------------|---------|-----------------| +| V2 Authentication | No — Phase 1 has no auth | Phase 2 | +| V3 Session Management | No — Phase 1 has no sessions | Phase 2 | +| V4 Access Control | Partial — object key isolation in MinIO backend | `user_id` prefix enforced in `MinIOBackend.put_object()` | +| V5 Input Validation | Yes — file upload content type + size | Existing `ALLOWED_MIME_TYPES` enforcement (currently unenforced per CONCERNS.md item 1) | +| V6 Cryptography | No — Phase 1 has no credential encryption | Phase 5 | + +### Known Threat Patterns for This Phase + +| Pattern | STRIDE | Standard Mitigation | +|---------|--------|---------------------| +| Object key prediction / path traversal | Tampering | UUID-based object keys (`{user_id}/{document_id}/{uuid4}{ext}`); never accept object keys from request parameters | +| Database superuser credentials in app DSN | Elevation of Privilege | Two-DSN pattern: `docuvault_app` (restricted) for runtime, `docuvault_migrate` (DDL) for Alembic only | +| MinIO credentials with bucket admin rights | Elevation of Privilege | App-level access key pair (`MINIO_ACCESS_KEY` / `MINIO_SECRET_KEY`) with read/write on `docuvault` bucket only; root credentials not used by app | +| Redis unauthenticated in Docker network | Information Disclosure | `requirepass` set on Redis; `REDIS_URL` includes password; Celery broker and app use authenticated URL | +| SQL injection via ORM | Tampering | SQLAlchemy ORM / parameterized queries throughout; zero raw string interpolation (matches CLAUDE.md SEC-03) | +| Sensitive data in MinIO object key | Information Disclosure | Human-readable filenames stored in DB only; object key is UUID-based and non-predictable | + +--- + +## Sources + +### Primary (HIGH confidence) +- [docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html](https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html) — async engine setup, `async_sessionmaker`, `expire_on_commit=False`, FastAPI lifespan integration +- [alembic.sqlalchemy.org/en/latest/cookbook.html#using-asyncio-with-alembic](https://alembic.sqlalchemy.org/en/latest/cookbook.html) — async `env.py` pattern +- [github.com/sqlalchemy/alembic/blob/main/alembic/templates/async/env.py](https://github.com/sqlalchemy/alembic/blob/main/alembic/templates/async/env.py) — official async env.py template code +- [github.com/minio/minio-py/blob/master/docs/API.md](https://github.com/minio/minio-py/blob/master/docs/API.md) — `put_object`, `presigned_get_object`, constructor signatures +- [github.com/minio/minio/issues/18389](https://github.com/minio/minio/issues/18389) — `curl` removal from MinIO image; `mc ready local` as replacement +- [docs.min.io/enterprise/aistor-object-store/operations/monitoring/healthcheck-probe/](https://docs.min.io/enterprise/aistor-object-store/operations/monitoring/healthcheck-probe/) — `/minio/health/live` endpoint documented +- [docs.docker.com/reference/compose-file/services/#healthcheck](https://docs.docker.com/reference/compose-file/services/#healthcheck) — `healthcheck` + `depends_on: condition: service_healthy` syntax + +### Secondary (MEDIUM confidence) +- [docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html](https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html) — Redis URL format verified via WebSearch; Celery docs site was unreachable during research session +- [testdriven.io/blog/fastapi-and-celery/](https://testdriven.io/blog/fastapi-and-celery/) — Celery + FastAPI project structure and `.delay()` pattern +- WebSearch results cross-referenced with official docs for psycopg install extras, Redis broker URL format, PostgreSQL init script pattern + +### Tertiary (LOW confidence) +- None — all key claims cross-verified with at least one authoritative source + +--- + +## Metadata + +**Confidence breakdown:** +- Standard stack: HIGH — all packages verified on PyPI via `pip3 index versions`, slopcheck [OK] for all 6 core packages +- Architecture: HIGH — patterns drawn from SQLAlchemy official docs, Alembic official template, and MinIO official GitHub +- Pitfalls: HIGH — each pitfall sourced from official documentation or confirmed GitHub issues (not community blog posts only) +- Celery configuration: MEDIUM — Celery docs site was unreachable; URL format cross-verified via WebSearch + community sources + +**Research date:** 2026-05-21 +**Valid until:** 2026-06-21 for stable stack; MinIO healthcheck pattern should be re-verified if the Docker image version changes significantly diff --git a/.planning/phases/01-infrastructure-foundation/01-VALIDATION.md b/.planning/phases/01-infrastructure-foundation/01-VALIDATION.md new file mode 100644 index 0000000..2eb4b7c --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/01-VALIDATION.md @@ -0,0 +1,82 @@ +--- +phase: 1 +slug: infrastructure-foundation +status: draft +nyquist_compliant: false +wave_0_complete: false +created: 2026-05-21 +--- + +# Phase 1 — Validation Strategy + +> Per-phase validation contract for feedback sampling during execution. + +--- + +## Test Infrastructure + +| Property | Value | +|----------|-------| +| **Framework** | pytest 7.x (asyncio_mode = auto) | +| **Config file** | `backend/pytest.ini` | +| **Quick run command** | `cd backend && pytest tests/test_health.py -v` | +| **Full suite command** | `cd backend && pytest -v` | +| **Estimated runtime** | ~30 seconds | + +--- + +## Sampling Rate + +- **After every task commit:** Run `cd backend && pytest tests/test_health.py -v` +- **After every plan wave:** Run `cd backend && pytest -v` +- **Before `/gsd:verify-work`:** Full suite must be green +- **Max feedback latency:** 30 seconds + +--- + +## Per-Task Verification Map + +| Task ID | Plan | Wave | Requirement | Threat Ref | Secure Behavior | Test Type | Automated Command | File Exists | Status | +|---------|------|------|-------------|------------|-----------------|-----------|-------------------|-------------|--------| +| 1-01-01 | 01 | 1 | STORE-01 | — | N/A | unit | `cd backend && pytest tests/test_health.py -v` | ❌ W0 | ⬜ pending | +| 1-01-02 | 01 | 1 | STORE-01 | — | MinIO key never contains human filename | unit | `cd backend && pytest tests/test_storage.py::test_object_key_format -v` | ❌ W0 | ⬜ pending | +| 1-02-01 | 02 | 1 | STORE-01 | — | N/A | unit | `cd backend && pytest tests/test_alembic.py -v` | ❌ W0 | ⬜ pending | +| 1-02-02 | 02 | 1 | STORE-02 | — | Object key is UUID-based, not filename | unit | `cd backend && pytest tests/test_storage.py::test_minio_key_schema -v` | ❌ W0 | ⬜ pending | +| 1-03-01 | 03 | 2 | STORE-07 | — | No file locks; multiple workers can run | integration | `cd backend && pytest tests/test_documents.py -v` | ❌ W0 | ⬜ pending | +| 1-03-02 | 03 | 2 | STORE-01 | — | End-to-end upload works with new storage layer | integration | `cd backend && pytest tests/test_documents.py::test_upload_end_to_end -v` | ❌ W0 | ⬜ pending | + +*Status: ⬜ pending · ✅ green · ❌ red · ⚠️ flaky* + +--- + +## Wave 0 Requirements + +- [ ] `tests/test_health.py` — rewrite to probe PostgreSQL + MinIO endpoints (not just `{"status": "ok"}`) +- [ ] `tests/test_storage.py` — new file: unit tests for `StorageBackend` ABC, MinIO key schema (STORE-02), and object CRUD +- [ ] `tests/test_alembic.py` — new file: verify migration applies cleanly, all tables exist, columns match schema +- [ ] `tests/test_documents.py` — rewrite to use PostgreSQL + MinIO storage layer (existing tests are coupled to flat-file storage) +- [ ] `tests/conftest.py` — add async SQLAlchemy test engine fixture (in-memory SQLite for unit tests, real PostgreSQL for integration) + +--- + +## Manual-Only Verifications + +| Behavior | Requirement | Why Manual | Test Instructions | +|----------|-------------|------------|-------------------| +| `docker compose up` starts all services cleanly | STORE-01 | Requires Docker daemon | Run `docker compose up --build` and confirm all health checks pass in the `docker ps` output | +| `alembic upgrade head` completes with no errors | STORE-01 | Requires live PostgreSQL | Run `cd backend && alembic upgrade head` against the Docker PostgreSQL instance; confirm zero errors and all tables created | +| Celery worker starts and processes a task | STORE-07 | Requires running Redis + worker | Trigger a document upload; confirm extraction + classification completes via Celery (check worker logs) | +| Existing document upload workflow works end-to-end | STORE-01 | Full integration | Upload a PDF through the UI; confirm it appears in the document list with extracted text and AI classification | + +--- + +## Validation Sign-Off + +- [ ] All tasks have `` verify or Wave 0 dependencies +- [ ] Sampling continuity: no 3 consecutive tasks without automated verify +- [ ] Wave 0 covers all MISSING references +- [ ] No watch-mode flags +- [ ] Feedback latency < 30s +- [ ] `nyquist_compliant: true` set in frontmatter + +**Approval:** pending diff --git a/.planning/phases/01-infrastructure-foundation/SKELETON.md b/.planning/phases/01-infrastructure-foundation/SKELETON.md new file mode 100644 index 0000000..0bdb79a --- /dev/null +++ b/.planning/phases/01-infrastructure-foundation/SKELETON.md @@ -0,0 +1,55 @@ +# Walking Skeleton — DocuVault + +**Phase:** 1 +**Generated:** 2026-05-21 + +## Capability Proven End-to-End + +A real document upload (PDF or TXT) sent to `POST /api/documents/upload` running inside `docker compose up` persists its metadata in PostgreSQL, stores its bytes in MinIO under the `docuvault` bucket using a `{user_id|null-marker}/{document_id}/{uuid4()}{ext}` object key, enqueues a Celery task on Redis that performs text extraction and AI classification, and returns the document JSON — with `GET /health` simultaneously reporting `postgres: ok` and `minio: ok`. + +## Architectural Decisions + +| Decision | Choice | Rationale | +|---|---|---| +| Backend framework | FastAPI 0.111+ on Python 3.12 (existing) | Already in use; async ecosystem matches PostgreSQL + MinIO + Celery | +| ORM / async DB driver | SQLAlchemy 2.0 async + `psycopg[binary]` v3 (`postgresql+psycopg://` URL prefix) | Single driver works for both Alembic (sync) and FastAPI (async); RESEARCH.md Pattern 1 | +| Migrations | Alembic async template (`alembic init -t async`); two DSNs — `DATABASE_URL` (app, restricted) + `DATABASE_MIGRATE_URL` (DDL); `expire_on_commit=False` on `async_sessionmaker` | D-13, D-14; RESEARCH.md Pattern 2; Pitfall 1 | +| Object storage | MinIO official Python SDK wrapped in `asyncio.to_thread()`; single bucket `docuvault`; UUID-based keys | D-06; RESEARCH.md Pattern 3; STORE-02 | +| Background queue | Celery 5.4+ with Redis broker + result backend; sync `def` tasks (no `async def` for tasks) | D-08, D-10; RESEARCH.md Pattern 5; replaces FastAPI BackgroundTasks per STORE-08 | +| Storage abstraction | `StorageBackend` ABC + `get_storage_backend()` factory in `backend/storage/` mirroring `backend/ai/base.py` + `backend/ai/__init__.py` | Established project pattern; CLOUD-07 forward-compatible | +| Secrets / config | Pydantic Settings reading `.env` in dev; `env_file: /etc/docuvault/env` in prod (`chmod 600`); `.env.example` committed with safe placeholders | D-11, D-12 | +| Service ordering | Docker Compose `healthcheck` + `depends_on: condition: service_healthy` for postgres / minio / redis; `mc ready local` for MinIO; `redis-cli -a $REDIS_PASSWORD ping` for Redis | RESEARCH.md Pattern 6 + Pitfall 5 | +| Directory layout | `backend/db/` (models, session), `backend/deps/` (FastAPI deps), `backend/storage/` (object backend ABC + impls), `backend/tasks/` (Celery tasks), `backend/migrations/` (Alembic) | RESEARCH.md "Recommended Project Structure" | +| Deployment | Local `docker compose up` only in Phase 1; production deploy target deferred to a later phase | Phase 1 success criterion is single-command local boot | + +## Stack Touched in Phase 1 + +- [x] Project scaffold — Python 3.12 backend container, Dockerfile unchanged +- [x] Routing — `GET /health` (extended) + `POST /api/documents/upload` (rewired) + `GET /api/documents` (rewired) +- [x] Database — Alembic migration creates full v1 schema (10 tables incl. `groups` stub per D-02); upload writes a `documents` row, list reads documents +- [x] Object storage — MinIO bucket auto-created on app startup; upload writes object, key matches `{user_id|null-marker}/{document_id}/{uuid4()}{ext}` +- [x] Background worker — Celery worker container running; upload enqueues `tasks.document_tasks.extract_and_classify`; result observable in worker logs +- [x] Deployment — single `docker compose up` boots PostgreSQL + MinIO + Redis + backend + celery-worker; all health checks green + +## Out of Scope (Deferred to Later Slices) + +These are intentionally NOT in Phase 1 — later phases must not re-litigate this minimalism. + +- Users, authentication, registration, JWT, refresh tokens, TOTP, password reset — Phase 2 (`documents.user_id` is nullable in Phase 1 per D-03) +- Multi-user isolation enforcement (per-row ownership checks, presigned URL flow) — Phase 3 +- Per-user 100 MB quota enforcement (`UPDATE quotas ... RETURNING used_bytes`) — Phase 3 (`quotas` table exists per D-01 but has no rows and no constraint code path) +- Frontend changes — none; the Vue 3 SPA must continue to call the existing endpoint shapes +- Folders / sharing / search / PDF preview — Phase 4 +- Cloud storage backends (OneDrive, Google Drive, Nextcloud, WebDAV) — Phase 5 (`cloud_connections` table exists per D-01 but has no implementation) +- Admin endpoints, audit log writes, CSRF, CSP headers, rate limiting — Phase 2 +- Existing flat-file data migration script — explicitly skipped (D-04: `data/` directory is deleted; test data only) +- Production deployment target / CD pipeline — deferred + +## Subsequent Slice Plan + +Each later phase adds one vertical slice on top of this skeleton without altering its architectural decisions: + +- **Phase 2:** Users register / log in / enroll TOTP / reset password / sign out all — adds `NOT NULL` constraint to `documents.user_id`, populates `users` + `refresh_tokens`, adds JWT middleware + CSRF + rate limiting + security headers, plus admin user management +- **Phase 3:** All documents owned by a user; presigned URL flow for downloads; atomic 100 MB quota enforced via the `quotas` table seeded in Phase 1; admin-assigned AI provider/model used for classification +- **Phase 4:** Folder CRUD + document move; share by handle; full-text search via PostgreSQL `tsvector`; in-browser PDF preview proxied through the app; admin audit log viewer +- **Phase 5:** `StorageBackend` ABC gains OneDrive / Google Drive / Nextcloud / WebDAV implementations; HKDF per-user credential encryption populates `cloud_connections.credentials_enc`