Add PDF document service with AI extraction and per-app settings

- New `features/doc-service` FastAPI microservice: PDF upload, async text extraction (pdfplumber), AI classification via Anthropic/Ollama/LM Studio, per-user categories, file download
- Alembic migration isolated with `alembic_version_doc_service` table
- Main backend: httpx proxy routers for /api/documents/* and /api/documents/categories/*, admin settings API at /api/settings/*
- Runtime config in /config/doc_service_config.json (shared Docker volume); api_key masking on reads; atomic write with os.replace()
- Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage launcher hub, simplified Nav (removed Settings link), new routes
- docker-compose: doc-service service, doc_data + app_config volumes, removed internal:true from backend-net for outbound AI API calls
- Fix pre-commit hook: probe Docker socket path so git subprocess picks up Docker Desktop on macOS
- Fix security_check.py: use sys.executable for bandit so venv python is used instead of system python

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 05:28:11 +02:00
parent d423bea134
commit 0d34867a69
52 changed files with 2500 additions and 28 deletions
@@ -0,0 +1,55 @@
+import asyncio
+from logging.config import fileConfig
+
+from alembic import context
+from sqlalchemy.ext.asyncio import create_async_engine
+
+from app.core.config import settings
+from app.database import Base
+import app.models  # noqa: F401 — registers Document, DocumentCategory, CategoryAssignment
+
+config = context.config
+config.set_main_option("sqlalchemy.url", settings.DATABASE_URL)
+
+if config.config_file_name:
+    fileConfig(config.config_file_name)
+
+target_metadata = Base.metadata
+
+# Separate version table — must not collide with the main backend's alembic_version table.
+VERSION_TABLE = "alembic_version_doc_service"
+
+
+def run_migrations_offline():
+    context.configure(
+        url=settings.DATABASE_URL,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+        version_table=VERSION_TABLE,
+    )
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def do_run_migrations(connection):
+    context.configure(
+        connection=connection,
+        target_metadata=target_metadata,
+        version_table=VERSION_TABLE,
+    )
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+async def run_migrations_online():
+    engine = create_async_engine(settings.DATABASE_URL)
+    async with engine.connect() as conn:
+        await conn.run_sync(do_run_migrations)
+    await engine.dispose()
+
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    asyncio.run(run_migrations_online())
@@ -0,0 +1,25 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+## Template slot: emits the generated upgrade operations, or "pass" when there are none.
+## (Mako "##" comment line; it is not written into rendered files.)
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+
+
+## Template slot: emits the generated downgrade operations, or "pass" when there are none.
+## (Mako "##" comment line; it is not written into rendered files.)
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}
@@ -0,0 +1,79 @@
+"""create document tables
+
+Revision ID: 0001
+Revises:
+Create Date: 2026-04-14
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+# Revision identifiers for this migration script.
+revision: str = "0001"
+# None: this revision has no parent (the docstring's "Revises:" field is empty).
+down_revision: Union[str, None] = None
+# No branch labels and no cross-revision dependencies are declared.
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "documents",
+        sa.Column("id", sa.String(), primary_key=True),
+        sa.Column("user_id", sa.String(), nullable=False),
+        sa.Column("filename", sa.String(), nullable=False),
+        sa.Column("file_path", sa.String(), nullable=False),
+        sa.Column("file_size", sa.Integer(), nullable=False),
+        sa.Column("status", sa.String(), nullable=False),
+        sa.Column("document_type", sa.String(), nullable=True),
+        sa.Column("raw_text", sa.Text(), nullable=True),
+        sa.Column("extracted_data", sa.Text(), nullable=True),
+        sa.Column("tags", sa.Text(), nullable=True),
+        sa.Column("error_message", sa.String(500), nullable=True),
+        sa.Column(
+            "created_at",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            nullable=False,
+        ),
+        sa.Column("processed_at", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.create_index("ix_documents_user_id", "documents", ["user_id"])
+
+    op.create_table(
+        "document_categories",
+        sa.Column("id", sa.String(), primary_key=True),
+        sa.Column("user_id", sa.String(), nullable=False),
+        sa.Column("name", sa.String(128), nullable=False),
+        sa.Column(
+            "created_at",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            nullable=False,
+        ),
+    )
+    op.create_index("ix_document_categories_user_id", "document_categories", ["user_id"])
+
+    op.create_table(
+        "document_category_assignments",
+        sa.Column(
+            "document_id",
+            sa.String(),
+            sa.ForeignKey("documents.id", ondelete="CASCADE"),
+            primary_key=True,
+        ),
+        sa.Column(
+            "category_id",
+            sa.String(),
+            sa.ForeignKey("document_categories.id", ondelete="CASCADE"),
+            primary_key=True,
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_table("document_category_assignments")
+    op.drop_index("ix_document_categories_user_id", "document_categories")
+    op.drop_table("document_categories")
+    op.drop_index("ix_documents_user_id", "documents")
+    op.drop_table("documents")