Add PDF document service with AI extraction and per-app settings

- New `features/doc-service` FastAPI microservice: PDF upload, async
  text extraction (pdfplumber), AI classification via Anthropic/Ollama/
  LM Studio, per-user categories, file download
- Alembic migration isolated with `alembic_version_doc_service` table
- Main backend: httpx proxy routers for /api/documents/* and
  /api/documents/categories/*, admin settings API at /api/settings/*
- Runtime config in /config/doc_service_config.json (shared Docker
  volume); api_key masking on reads; atomic write with os.replace()
- Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage
  launcher hub, simplified Nav (removed Settings link), new routes
- docker-compose: doc-service service, doc_data + app_config volumes,
  removed internal:true from backend-net for outbound AI API calls
- Fix pre-commit hook: probe Docker socket path so git subprocess picks
  up Docker Desktop on macOS
- Fix security_check.py: use sys.executable for bandit so venv python
  is used instead of system python

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-14 05:28:11 +02:00
parent d423bea134
commit 0d34867a69
52 changed files with 2500 additions and 28 deletions
+55
View File
@@ -0,0 +1,55 @@
import asyncio
from logging.config import fileConfig
from alembic import context
from sqlalchemy.ext.asyncio import create_async_engine
from app.core.config import settings
from app.database import Base
import app.models # noqa: F401 — registers Document, DocumentCategory, CategoryAssignment
config = context.config
# set_main_option() hands the value to ConfigParser, which treats "%" as
# interpolation syntax. A URL with percent-encoded characters (for example in
# the password) would raise ValueError, so every "%" is doubled here.
config.set_main_option("sqlalchemy.url", settings.DATABASE_URL.replace("%", "%%"))
# Only set up logging when an ini file is actually attached to the config.
if config.config_file_name:
    fileConfig(config.config_file_name)
# Metadata handed to context.configure() in both migration modes below.
target_metadata = Base.metadata
# Separate version table — must not collide with the main backend's alembic_version table.
VERSION_TABLE = "alembic_version_doc_service"
def run_migrations_offline():
    """Run migrations in offline mode, configured from the URL alone.

    No engine or connection is created here: the context is configured with
    ``settings.DATABASE_URL`` and literal binds, and revision tracking uses
    the ``VERSION_TABLE`` table.
    """
    offline_options = {
        "url": settings.DATABASE_URL,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "version_table": VERSION_TABLE,
    }
    context.configure(**offline_options)
    with context.begin_transaction():
        context.run_migrations()
def do_run_migrations(connection):
    """Configure the Alembic context on *connection* and run the migrations.

    Args:
        connection: Connection passed straight to ``context.configure``. In
            this file it is supplied by ``run_sync`` from
            ``run_migrations_online``.
    """
    online_options = dict(
        connection=connection,
        target_metadata=target_metadata,
        version_table=VERSION_TABLE,
    )
    context.configure(**online_options)
    with context.begin_transaction():
        context.run_migrations()
async def run_migrations_online():
    """Run migrations against the database through an async engine.

    An engine is created from ``settings.DATABASE_URL``, a connection is
    opened, and ``do_run_migrations`` is executed on it via ``run_sync``.
    The engine is disposed afterwards, including when a migration raises.

    Raises:
        Any exception raised while connecting or migrating propagates to the
        caller after the engine has been disposed.
    """
    engine = create_async_engine(settings.DATABASE_URL)
    try:
        async with engine.connect() as conn:
            # do_run_migrations is a plain (non-async) function, so it is
            # executed through run_sync on the async connection.
            await conn.run_sync(do_run_migrations)
    finally:
        # Dispose on the failure path too, so the engine is never left open.
        await engine.dispose()
# Pick the migration mode when this module runs: the online path is a
# coroutine and is driven by asyncio.run(); the offline path is a plain call.
if not context.is_offline_mode():
    asyncio.run(run_migrations_online())
else:
    run_migrations_offline()
@@ -0,0 +1,25 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
## NOTE(review): this looks like a Mako revision-file template; lines starting
## with "##" are Mako comments and are not emitted into the rendered output.
## Revision identifiers, rendered from the template variables via repr().
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
## The body falls back to "pass" when the `upgrades` variable is empty.
def upgrade() -> None:
${upgrades if upgrades else "pass"}
## The body falls back to "pass" when the `downgrades` variable is empty.
def downgrade() -> None:
${downgrades if downgrades else "pass"}
@@ -0,0 +1,79 @@
"""create document tables
Revision ID: 0001
Revises:
Create Date: 2026-04-14
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Revision identifiers for this migration.
revision: str = "0001"
# No parent revision is set, matching the empty "Revises:" line in the module docstring.
down_revision: Union[str, None] = None
# No branch labels or cross-revision dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the document tables and their ``user_id`` indexes.

    Creates, in order: ``documents`` (plus ``ix_documents_user_id``),
    ``document_categories`` (plus ``ix_document_categories_user_id``), and the
    ``document_category_assignments`` link table, whose two foreign keys both
    cascade on delete and together form the primary key.
    """

    def created_at_column():
        # Returns a fresh Column on every call so each table gets its own object.
        return sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        )

    def cascading_key_column(column_name, target):
        # String foreign key that is part of the primary key and cascades on delete.
        return sa.Column(
            column_name,
            sa.String(),
            sa.ForeignKey(target, ondelete="CASCADE"),
            primary_key=True,
        )

    document_columns = [
        sa.Column("id", sa.String(), primary_key=True),
        sa.Column("user_id", sa.String(), nullable=False),
        sa.Column("filename", sa.String(), nullable=False),
        sa.Column("file_path", sa.String(), nullable=False),
        sa.Column("file_size", sa.Integer(), nullable=False),
        sa.Column("status", sa.String(), nullable=False),
        sa.Column("document_type", sa.String(), nullable=True),
        sa.Column("raw_text", sa.Text(), nullable=True),
        sa.Column("extracted_data", sa.Text(), nullable=True),
        sa.Column("tags", sa.Text(), nullable=True),
        sa.Column("error_message", sa.String(500), nullable=True),
        created_at_column(),
        sa.Column("processed_at", sa.DateTime(timezone=True), nullable=True),
    ]
    op.create_table("documents", *document_columns)
    op.create_index("ix_documents_user_id", "documents", ["user_id"])

    category_columns = [
        sa.Column("id", sa.String(), primary_key=True),
        sa.Column("user_id", sa.String(), nullable=False),
        sa.Column("name", sa.String(128), nullable=False),
        created_at_column(),
    ]
    op.create_table("document_categories", *category_columns)
    op.create_index("ix_document_categories_user_id", "document_categories", ["user_id"])

    assignment_columns = [
        cascading_key_column("document_id", "documents.id"),
        cascading_key_column("category_id", "document_categories.id"),
    ]
    op.create_table("document_category_assignments", *assignment_columns)
def downgrade() -> None:
    """Drop everything created by ``upgrade``, in reverse dependency order.

    The assignment table goes first because it references the other two; then
    each remaining table has its ``user_id`` index dropped before the table.
    """
    op.drop_table("document_category_assignments")
    indexed_tables = (
        ("ix_document_categories_user_id", "document_categories"),
        ("ix_documents_user_id", "documents"),
    )
    for index_name, table_name in indexed_tables:
        op.drop_index(index_name, table_name)
        op.drop_table(table_name)