Add PDF document service with AI extraction and per-app settings
- New `features/doc-service` FastAPI microservice: PDF upload, async text extraction (pdfplumber), AI classification via Anthropic/Ollama/ LM Studio, per-user categories, file download - Alembic migration isolated with `alembic_version_doc_service` table - Main backend: httpx proxy routers for /api/documents/* and /api/documents/categories/*, admin settings API at /api/settings/* - Runtime config in /config/doc_service_config.json (shared Docker volume); api_key masking on reads; atomic write with os.replace() - Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage launcher hub, simplified Nav (removed Settings link), new routes - docker-compose: doc-service service, doc_data + app_config volumes, removed internal:true from backend-net for outbound AI API calls - Fix pre-commit hook: probe Docker socket path so git subprocess picks up Docker Desktop on macOS - Fix security_check.py: use sys.executable for bandit so venv python is used instead of system python Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
DATABASE_URL=postgresql+asyncpg://postgres:password@db:5432/destroying_sap
|
||||
DATA_DIR=/data/documents
|
||||
CONFIG_PATH=/config/doc_service_config.json
|
||||
@@ -0,0 +1,34 @@
|
||||
# ── Stage 1: dependency installation ─────────────────────────────────────────
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
COPY pyproject.toml .
|
||||
RUN pip install --prefix=/install .
|
||||
|
||||
# ── Stage 2: runtime ──────────────────────────────────────────────────────────
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Create non-root user (UID/GID 1001)
|
||||
RUN groupadd --gid 1001 appuser && \
|
||||
useradd --uid 1001 --gid 1001 --no-create-home --shell /bin/sh appuser
|
||||
|
||||
# Pre-create data and config dirs with correct ownership.
|
||||
# Named volumes mounted over these paths will inherit ownership on first creation.
|
||||
RUN mkdir -p /data/documents /config && chown -R appuser:appuser /data /config
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY --from=builder /install /usr/local
|
||||
COPY --chown=appuser:appuser app ./app
|
||||
COPY --chown=appuser:appuser alembic ./alembic
|
||||
COPY --chown=appuser:appuser alembic.ini .
|
||||
COPY --chown=appuser:appuser scripts ./scripts
|
||||
|
||||
USER appuser
|
||||
|
||||
EXPOSE 8001
|
||||
|
||||
CMD ["sh", "scripts/start.sh"]
|
||||
@@ -0,0 +1,45 @@
|
||||
[alembic]
|
||||
script_location = alembic
|
||||
prepend_sys_path = .
|
||||
version_path_separator = os
|
||||
sqlalchemy.url = postgresql+asyncpg://postgres:password@localhost:5432/destroying_sap
|
||||
|
||||
# Use a separate version table so this service's migrations don't collide
|
||||
# with the main backend's alembic_version table in the shared postgres instance.
|
||||
version_table = alembic_version_doc_service
|
||||
|
||||
[post_write_hooks]
|
||||
|
||||
[loggers]
|
||||
keys = root,sqlalchemy,alembic
|
||||
|
||||
[handlers]
|
||||
keys = console
|
||||
|
||||
[formatters]
|
||||
keys = generic
|
||||
|
||||
[logger_root]
|
||||
level = WARN
|
||||
handlers = console
|
||||
qualname =
|
||||
|
||||
[logger_sqlalchemy]
|
||||
level = WARN
|
||||
handlers =
|
||||
qualname = sqlalchemy.engine
|
||||
|
||||
[logger_alembic]
|
||||
level = INFO
|
||||
handlers =
|
||||
qualname = alembic
|
||||
|
||||
[handler_console]
|
||||
class = StreamHandler
|
||||
args = (sys.stderr,)
|
||||
level = NOTSET
|
||||
formatter = generic
|
||||
|
||||
[formatter_generic]
|
||||
format = %(levelname)-5.5s [%(name)s] %(message)s
|
||||
datefmt = %H:%M:%S
|
||||
@@ -0,0 +1,55 @@
|
||||
import asyncio
|
||||
from logging.config import fileConfig
|
||||
|
||||
from alembic import context
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
|
||||
from app.core.config import settings
|
||||
from app.database import Base
|
||||
import app.models # noqa: F401 — registers Document, DocumentCategory, CategoryAssignment
|
||||
|
||||
config = context.config
|
||||
config.set_main_option("sqlalchemy.url", settings.DATABASE_URL)
|
||||
|
||||
if config.config_file_name:
|
||||
fileConfig(config.config_file_name)
|
||||
|
||||
target_metadata = Base.metadata
|
||||
|
||||
# Separate version table — must not collide with the main backend's alembic_version table.
|
||||
VERSION_TABLE = "alembic_version_doc_service"
|
||||
|
||||
|
||||
def run_migrations_offline():
|
||||
context.configure(
|
||||
url=settings.DATABASE_URL,
|
||||
target_metadata=target_metadata,
|
||||
literal_binds=True,
|
||||
dialect_opts={"paramstyle": "named"},
|
||||
version_table=VERSION_TABLE,
|
||||
)
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
def do_run_migrations(connection):
|
||||
context.configure(
|
||||
connection=connection,
|
||||
target_metadata=target_metadata,
|
||||
version_table=VERSION_TABLE,
|
||||
)
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
async def run_migrations_online():
|
||||
engine = create_async_engine(settings.DATABASE_URL)
|
||||
async with engine.connect() as conn:
|
||||
await conn.run_sync(do_run_migrations)
|
||||
await engine.dispose()
|
||||
|
||||
|
||||
if context.is_offline_mode():
|
||||
run_migrations_offline()
|
||||
else:
|
||||
asyncio.run(run_migrations_online())
|
||||
@@ -0,0 +1,25 @@
|
||||
"""${message}
|
||||
|
||||
Revision ID: ${up_revision}
|
||||
Revises: ${down_revision | comma,n}
|
||||
Create Date: ${create_date}
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
${imports if imports else ""}
|
||||
|
||||
revision: str = ${repr(up_revision)}
|
||||
down_revision: Union[str, None] = ${repr(down_revision)}
|
||||
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
|
||||
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
${upgrades if upgrades else "pass"}
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
${downgrades if downgrades else "pass"}
|
||||
@@ -0,0 +1,79 @@
|
||||
"""create document tables
|
||||
|
||||
Revision ID: 0001
|
||||
Revises:
|
||||
Create Date: 2026-04-14
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision: str = "0001"
|
||||
down_revision: Union[str, None] = None
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"documents",
|
||||
sa.Column("id", sa.String(), primary_key=True),
|
||||
sa.Column("user_id", sa.String(), nullable=False),
|
||||
sa.Column("filename", sa.String(), nullable=False),
|
||||
sa.Column("file_path", sa.String(), nullable=False),
|
||||
sa.Column("file_size", sa.Integer(), nullable=False),
|
||||
sa.Column("status", sa.String(), nullable=False),
|
||||
sa.Column("document_type", sa.String(), nullable=True),
|
||||
sa.Column("raw_text", sa.Text(), nullable=True),
|
||||
sa.Column("extracted_data", sa.Text(), nullable=True),
|
||||
sa.Column("tags", sa.Text(), nullable=True),
|
||||
sa.Column("error_message", sa.String(500), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("processed_at", sa.DateTime(timezone=True), nullable=True),
|
||||
)
|
||||
op.create_index("ix_documents_user_id", "documents", ["user_id"])
|
||||
|
||||
op.create_table(
|
||||
"document_categories",
|
||||
sa.Column("id", sa.String(), primary_key=True),
|
||||
sa.Column("user_id", sa.String(), nullable=False),
|
||||
sa.Column("name", sa.String(128), nullable=False),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
op.create_index("ix_document_categories_user_id", "document_categories", ["user_id"])
|
||||
|
||||
op.create_table(
|
||||
"document_category_assignments",
|
||||
sa.Column(
|
||||
"document_id",
|
||||
sa.String(),
|
||||
sa.ForeignKey("documents.id", ondelete="CASCADE"),
|
||||
primary_key=True,
|
||||
),
|
||||
sa.Column(
|
||||
"category_id",
|
||||
sa.String(),
|
||||
sa.ForeignKey("document_categories.id", ondelete="CASCADE"),
|
||||
primary_key=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("document_category_assignments")
|
||||
op.drop_index("ix_document_categories_user_id", "document_categories")
|
||||
op.drop_table("document_categories")
|
||||
op.drop_index("ix_documents_user_id", "documents")
|
||||
op.drop_table("documents")
|
||||
@@ -0,0 +1,14 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
PROJECT_NAME: str = "doc-service"
|
||||
DATABASE_URL: str = "postgresql+asyncpg://postgres:password@db:5432/destroying_sap"
|
||||
DATA_DIR: str = "/data/documents"
|
||||
CONFIG_PATH: str = "/config/doc_service_config.json"
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
@@ -0,0 +1,16 @@
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
engine = create_async_engine(settings.DATABASE_URL, echo=False)
|
||||
AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
|
||||
pass
|
||||
|
||||
|
||||
async def get_db() -> AsyncSession:
|
||||
async with AsyncSessionLocal() as session:
|
||||
yield session
|
||||
@@ -0,0 +1,12 @@
|
||||
from fastapi import Header, HTTPException
|
||||
|
||||
|
||||
async def get_user_id(x_user_id: str = Header(...)) -> str:
|
||||
"""
|
||||
Extract the user identity injected by the main backend proxy.
|
||||
The main backend validates the JWT and forwards the user ID via this header.
|
||||
Doc-service trusts it because it is only reachable from backend on backend-net.
|
||||
"""
|
||||
if not x_user_id:
|
||||
raise HTTPException(status_code=400, detail="Missing X-User-Id header")
|
||||
return x_user_id
|
||||
@@ -0,0 +1,17 @@
|
||||
from fastapi import FastAPI
|
||||
|
||||
from app.core.config import settings
|
||||
from app.routers import categories, documents
|
||||
|
||||
app = FastAPI(title=settings.PROJECT_NAME)
|
||||
|
||||
# No CORS — this service is only reachable from the main backend on backend-net.
|
||||
# All browser traffic goes through the main backend proxy.
|
||||
|
||||
app.include_router(documents.router, prefix="/documents", tags=["documents"])
|
||||
app.include_router(categories.router, prefix="/categories", tags=["categories"])
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
return {"status": "ok"}
|
||||
@@ -0,0 +1,5 @@
|
||||
from app.models.document import Document
|
||||
from app.models.category import DocumentCategory
|
||||
from app.models.category_assignment import CategoryAssignment
|
||||
|
||||
__all__ = ["Document", "DocumentCategory", "CategoryAssignment"]
|
||||
@@ -0,0 +1,22 @@
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import DateTime, String, func
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class DocumentCategory(Base):
|
||||
__tablename__ = "document_categories"
|
||||
|
||||
id: Mapped[str] = mapped_column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
|
||||
user_id: Mapped[str] = mapped_column(String, nullable=False, index=True)
|
||||
name: Mapped[str] = mapped_column(String(128), nullable=False)
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
|
||||
assignments: Mapped[list["CategoryAssignment"]] = relationship(
|
||||
"CategoryAssignment", back_populates="category", cascade="all, delete-orphan"
|
||||
)
|
||||
@@ -0,0 +1,20 @@
|
||||
from sqlalchemy import ForeignKey, String
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class CategoryAssignment(Base):
|
||||
__tablename__ = "document_category_assignments"
|
||||
|
||||
document_id: Mapped[str] = mapped_column(
|
||||
String, ForeignKey("documents.id", ondelete="CASCADE"), primary_key=True
|
||||
)
|
||||
category_id: Mapped[str] = mapped_column(
|
||||
String, ForeignKey("document_categories.id", ondelete="CASCADE"), primary_key=True
|
||||
)
|
||||
|
||||
document: Mapped["Document"] = relationship("Document", back_populates="category_assignments")
|
||||
category: Mapped["DocumentCategory"] = relationship(
|
||||
"DocumentCategory", back_populates="assignments"
|
||||
)
|
||||
@@ -0,0 +1,31 @@
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import DateTime, Integer, String, Text, func
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Document(Base):
|
||||
__tablename__ = "documents"
|
||||
|
||||
id: Mapped[str] = mapped_column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
|
||||
user_id: Mapped[str] = mapped_column(String, nullable=False, index=True)
|
||||
filename: Mapped[str] = mapped_column(String, nullable=False)
|
||||
file_path: Mapped[str] = mapped_column(String, nullable=False)
|
||||
file_size: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||
status: Mapped[str] = mapped_column(String, nullable=False, default="pending")
|
||||
document_type: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
raw_text: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
extracted_data: Mapped[str | None] = mapped_column(Text, nullable=True) # JSON string
|
||||
tags: Mapped[str | None] = mapped_column(Text, nullable=True) # JSON array string
|
||||
error_message: Mapped[str | None] = mapped_column(String(500), nullable=True)
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
|
||||
|
||||
category_assignments: Mapped[list["CategoryAssignment"]] = relationship(
|
||||
"CategoryAssignment", back_populates="document", cascade="all, delete-orphan"
|
||||
)
|
||||
@@ -0,0 +1,80 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.database import get_db
|
||||
from app.deps import get_user_id
|
||||
from app.models.category import DocumentCategory
|
||||
from app.schemas.category import CategoryCreate, CategoryOut, CategoryUpdate
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("", response_model=list[CategoryOut])
|
||||
async def list_categories(
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> list[DocumentCategory]:
|
||||
result = await db.execute(
|
||||
select(DocumentCategory)
|
||||
.where(DocumentCategory.user_id == user_id)
|
||||
.order_by(DocumentCategory.name)
|
||||
)
|
||||
return result.scalars().all()
|
||||
|
||||
|
||||
@router.post("", response_model=CategoryOut, status_code=201)
|
||||
async def create_category(
|
||||
body: CategoryCreate,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> DocumentCategory:
|
||||
name = body.name.strip()
|
||||
if not name:
|
||||
raise HTTPException(status_code=422, detail="Category name cannot be empty")
|
||||
cat = DocumentCategory(user_id=user_id, name=name[:128])
|
||||
db.add(cat)
|
||||
await db.commit()
|
||||
await db.refresh(cat)
|
||||
return cat
|
||||
|
||||
|
||||
@router.patch("/{cat_id}", response_model=CategoryOut)
|
||||
async def rename_category(
|
||||
cat_id: str,
|
||||
body: CategoryUpdate,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> DocumentCategory:
|
||||
cat = await _get_user_cat(cat_id, user_id, db)
|
||||
name = body.name.strip()
|
||||
if not name:
|
||||
raise HTTPException(status_code=422, detail="Category name cannot be empty")
|
||||
cat.name = name[:128]
|
||||
await db.commit()
|
||||
await db.refresh(cat)
|
||||
return cat
|
||||
|
||||
|
||||
@router.delete("/{cat_id}", status_code=204)
|
||||
async def delete_category(
|
||||
cat_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> None:
|
||||
cat = await _get_user_cat(cat_id, user_id, db)
|
||||
await db.delete(cat)
|
||||
await db.commit()
|
||||
|
||||
|
||||
async def _get_user_cat(cat_id: str, user_id: str, db: AsyncSession) -> DocumentCategory:
|
||||
result = await db.execute(
|
||||
select(DocumentCategory).where(
|
||||
DocumentCategory.id == cat_id,
|
||||
DocumentCategory.user_id == user_id,
|
||||
)
|
||||
)
|
||||
cat = result.scalar_one_or_none()
|
||||
if cat is None:
|
||||
raise HTTPException(status_code=404, detail="Category not found")
|
||||
return cat
|
||||
@@ -0,0 +1,304 @@
|
||||
import asyncio
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import aiofiles
|
||||
import pdfplumber
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.database import AsyncSessionLocal, get_db
|
||||
from app.deps import get_user_id
|
||||
from app.models.category import DocumentCategory
|
||||
from app.models.category_assignment import CategoryAssignment
|
||||
from app.models.document import Document
|
||||
from app.schemas.document import DocumentOut, DocumentStatusOut, DocumentTypeUpdate
|
||||
from app.services.ai import get_provider
|
||||
from app.services.config_reader import load_doc_config
|
||||
from app.services.storage import delete_file, get_upload_path, save_upload
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_DEFAULT_MAX_BYTES = 20 * 1024 * 1024
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
|
||||
result = await db.execute(
|
||||
select(Document)
|
||||
.where(Document.id == doc_id, Document.user_id == user_id)
|
||||
.options(
|
||||
selectinload(Document.category_assignments)
|
||||
.selectinload(CategoryAssignment.category)
|
||||
)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if doc is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
return doc
|
||||
|
||||
|
||||
def _doc_with_categories(doc: Document) -> DocumentOut:
|
||||
from app.schemas.document import CategoryOut
|
||||
cats = [CategoryOut(id=a.category.id, name=a.category.name) for a in doc.category_assignments]
|
||||
return DocumentOut(
|
||||
id=doc.id,
|
||||
user_id=doc.user_id,
|
||||
filename=doc.filename,
|
||||
file_size=doc.file_size,
|
||||
status=doc.status,
|
||||
document_type=doc.document_type,
|
||||
extracted_data=doc.extracted_data,
|
||||
tags=doc.tags,
|
||||
error_message=doc.error_message,
|
||||
created_at=doc.created_at,
|
||||
processed_at=doc.processed_at,
|
||||
categories=cats,
|
||||
)
|
||||
|
||||
|
||||
def _extract_pdf_text(file_path: str) -> str:
|
||||
"""Synchronous — must be called via asyncio.to_thread."""
|
||||
text_parts = []
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
for page in pdf.pages:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
text_parts.append(page_text)
|
||||
return "\n".join(text_parts)
|
||||
|
||||
|
||||
# ── Background processing ─────────────────────────────────────────────────────
|
||||
|
||||
async def process_document(doc_id: str) -> None:
|
||||
"""
|
||||
Runs after the upload response is sent.
|
||||
Opens its own DB session — never use the request's Depends session here.
|
||||
Loads AI config fresh from the config file so settings changes apply without restart.
|
||||
"""
|
||||
async with AsyncSessionLocal() as db:
|
||||
doc = await db.get(Document, doc_id)
|
||||
if doc is None:
|
||||
return
|
||||
|
||||
doc.status = "processing"
|
||||
await db.commit()
|
||||
|
||||
try:
|
||||
text = await asyncio.to_thread(_extract_pdf_text, doc.file_path)
|
||||
config = await load_doc_config()
|
||||
provider = get_provider(config["ai"])
|
||||
result = await provider.classify_document(text)
|
||||
|
||||
doc.raw_text = text[:500_000] # cap stored text at 500k chars
|
||||
doc.extracted_data = json.dumps(result)
|
||||
doc.document_type = result.get("document_type", "unknown")
|
||||
doc.tags = json.dumps(result.get("tags", []))
|
||||
doc.status = "done"
|
||||
doc.processed_at = datetime.now(timezone.utc)
|
||||
except Exception as exc:
|
||||
doc.status = "failed"
|
||||
doc.error_message = str(exc)[:500]
|
||||
|
||||
await db.commit()
|
||||
|
||||
|
||||
# ── Routes ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/upload", response_model=DocumentOut, status_code=202)
|
||||
async def upload_document(
|
||||
file: UploadFile,
|
||||
background_tasks: BackgroundTasks,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> DocumentOut:
|
||||
if file.content_type not in ("application/pdf", "application/octet-stream"):
|
||||
if not (file.filename or "").lower().endswith(".pdf"):
|
||||
raise HTTPException(status_code=415, detail="Only PDF files are accepted")
|
||||
|
||||
config = await load_doc_config()
|
||||
max_bytes = config.get("documents", {}).get("max_pdf_bytes", _DEFAULT_MAX_BYTES)
|
||||
|
||||
file_data = await file.read()
|
||||
if len(file_data) > max_bytes:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"File exceeds maximum size of {max_bytes // (1024*1024)} MB",
|
||||
)
|
||||
|
||||
doc_id = str(uuid.uuid4())
|
||||
dest = await save_upload(file_data, user_id, doc_id)
|
||||
|
||||
doc = Document(
|
||||
id=doc_id,
|
||||
user_id=user_id,
|
||||
filename=file.filename or "upload.pdf",
|
||||
file_path=str(dest),
|
||||
file_size=len(file_data),
|
||||
status="pending",
|
||||
)
|
||||
db.add(doc)
|
||||
await db.commit()
|
||||
await db.refresh(doc)
|
||||
|
||||
background_tasks.add_task(process_document, doc_id)
|
||||
|
||||
return _doc_with_categories(doc)
|
||||
|
||||
|
||||
@router.get("", response_model=list[DocumentOut])
|
||||
async def list_documents(
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> list[DocumentOut]:
|
||||
result = await db.execute(
|
||||
select(Document)
|
||||
.where(Document.user_id == user_id)
|
||||
.options(
|
||||
selectinload(Document.category_assignments)
|
||||
.selectinload(CategoryAssignment.category)
|
||||
)
|
||||
.order_by(Document.created_at.desc())
|
||||
)
|
||||
return [_doc_with_categories(d) for d in result.scalars().all()]
|
||||
|
||||
|
||||
@router.get("/{doc_id}", response_model=DocumentOut)
|
||||
async def get_document(
|
||||
doc_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> DocumentOut:
|
||||
doc = await _get_user_doc(doc_id, user_id, db)
|
||||
return _doc_with_categories(doc)
|
||||
|
||||
|
||||
@router.get("/{doc_id}/status", response_model=DocumentStatusOut)
|
||||
async def get_document_status(
|
||||
doc_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> Document:
|
||||
result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if doc is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
return doc
|
||||
|
||||
|
||||
@router.patch("/{doc_id}/type", response_model=DocumentOut)
|
||||
async def update_document_type(
|
||||
doc_id: str,
|
||||
body: DocumentTypeUpdate,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> DocumentOut:
|
||||
doc = await _get_user_doc(doc_id, user_id, db)
|
||||
doc.document_type = body.document_type
|
||||
await db.commit()
|
||||
await db.refresh(doc)
|
||||
return _doc_with_categories(doc)
|
||||
|
||||
|
||||
@router.delete("/{doc_id}", status_code=204)
|
||||
async def delete_document(
|
||||
doc_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> None:
|
||||
result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if doc is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
delete_file(doc.file_path)
|
||||
await db.delete(doc)
|
||||
await db.commit()
|
||||
|
||||
|
||||
@router.get("/{doc_id}/file")
|
||||
async def download_file(
|
||||
doc_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> StreamingResponse:
|
||||
result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
)
|
||||
doc = result.scalar_one_or_none()
|
||||
if doc is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
async def file_generator():
|
||||
async with aiofiles.open(doc.file_path, "rb") as f:
|
||||
while chunk := await f.read(64 * 1024):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(
|
||||
file_generator(),
|
||||
media_type="application/pdf",
|
||||
headers={"Content-Disposition": f'inline; filename="{doc.filename}"'},
|
||||
)
|
||||
|
||||
|
||||
# ── Category assignment ───────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/{doc_id}/categories/{cat_id}", status_code=204)
|
||||
async def assign_category(
|
||||
doc_id: str,
|
||||
cat_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> None:
|
||||
# Verify both belong to this user
|
||||
doc_result = await db.execute(
|
||||
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
|
||||
)
|
||||
if doc_result.scalar_one_or_none() is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
cat_result = await db.execute(
|
||||
select(DocumentCategory).where(
|
||||
DocumentCategory.id == cat_id, DocumentCategory.user_id == user_id
|
||||
)
|
||||
)
|
||||
if cat_result.scalar_one_or_none() is None:
|
||||
raise HTTPException(status_code=404, detail="Category not found")
|
||||
|
||||
# Upsert — ignore if already assigned
|
||||
existing = await db.execute(
|
||||
select(CategoryAssignment).where(
|
||||
CategoryAssignment.document_id == doc_id,
|
||||
CategoryAssignment.category_id == cat_id,
|
||||
)
|
||||
)
|
||||
if existing.scalar_one_or_none() is None:
|
||||
db.add(CategoryAssignment(document_id=doc_id, category_id=cat_id))
|
||||
await db.commit()
|
||||
|
||||
|
||||
@router.delete("/{doc_id}/categories/{cat_id}", status_code=204)
|
||||
async def remove_category(
|
||||
doc_id: str,
|
||||
cat_id: str,
|
||||
user_id: str = Depends(get_user_id),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
) -> None:
|
||||
result = await db.execute(
|
||||
select(CategoryAssignment).where(
|
||||
CategoryAssignment.document_id == doc_id,
|
||||
CategoryAssignment.category_id == cat_id,
|
||||
)
|
||||
)
|
||||
assignment = result.scalar_one_or_none()
|
||||
if assignment:
|
||||
await db.delete(assignment)
|
||||
await db.commit()
|
||||
@@ -0,0 +1,20 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class CategoryOut(BaseModel):
|
||||
id: str
|
||||
user_id: str
|
||||
name: str
|
||||
created_at: datetime
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class CategoryCreate(BaseModel):
|
||||
name: str
|
||||
|
||||
|
||||
class CategoryUpdate(BaseModel):
|
||||
name: str
|
||||
@@ -0,0 +1,39 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class CategoryOut(BaseModel):
|
||||
id: str
|
||||
name: str
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class DocumentOut(BaseModel):
|
||||
id: str
|
||||
user_id: str
|
||||
filename: str
|
||||
file_size: int
|
||||
status: str
|
||||
document_type: str | None
|
||||
extracted_data: str | None # JSON string — frontend calls JSON.parse()
|
||||
tags: str | None # JSON array string
|
||||
error_message: str | None
|
||||
created_at: datetime
|
||||
processed_at: datetime | None
|
||||
categories: list[CategoryOut] = []
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class DocumentStatusOut(BaseModel):
|
||||
id: str
|
||||
status: str
|
||||
error_message: str | None
|
||||
processed_at: datetime | None
|
||||
|
||||
model_config = {"from_attributes": True}
|
||||
|
||||
|
||||
class DocumentTypeUpdate(BaseModel):
|
||||
document_type: str
|
||||
@@ -0,0 +1,23 @@
|
||||
from app.services.ai.base import AIProvider
|
||||
|
||||
|
||||
def get_provider(ai_config: dict) -> AIProvider:
|
||||
"""
|
||||
Factory: return an AIProvider instance based on the 'provider' key in the AI config section.
|
||||
ai_config is the 'ai' section of doc_service_config.json, loaded fresh per processing job.
|
||||
"""
|
||||
provider_name = ai_config.get("provider", "anthropic")
|
||||
provider_cfg = ai_config.get(provider_name, {})
|
||||
|
||||
match provider_name:
|
||||
case "anthropic":
|
||||
from app.services.ai.anthropic_provider import AnthropicProvider
|
||||
return AnthropicProvider(provider_cfg)
|
||||
case "ollama" | "lmstudio":
|
||||
from app.services.ai.openai_compat import OpenAICompatProvider
|
||||
return OpenAICompatProvider(provider_cfg)
|
||||
case _:
|
||||
raise ValueError(f"Unknown AI provider: {provider_name!r}")
|
||||
|
||||
|
||||
__all__ = ["AIProvider", "get_provider"]
|
||||
@@ -0,0 +1,31 @@
|
||||
import json
|
||||
|
||||
from anthropic import AsyncAnthropic
|
||||
|
||||
from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
|
||||
|
||||
|
||||
class AnthropicProvider(AIProvider):
|
||||
def __init__(self, config: dict) -> None:
|
||||
self._client = AsyncAnthropic(api_key=config["api_key"])
|
||||
self._model = config.get("model", "claude-haiku-4-5-20251001")
|
||||
|
||||
async def classify_document(self, text: str) -> dict:
|
||||
message = await self._client.messages.create(
|
||||
model=self._model,
|
||||
max_tokens=2048,
|
||||
system=SYSTEM_PROMPT,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": USER_PROMPT_TEMPLATE.format(text=text[:100_000]),
|
||||
}],
|
||||
)
|
||||
raw = message.content[0].text.strip()
|
||||
return _parse_json(raw)
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> dict:
|
||||
# Strip accidental markdown fences despite explicit instruction not to include them
|
||||
if raw.startswith("```"):
|
||||
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
|
||||
return json.loads(raw)
|
||||
@@ -0,0 +1,31 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
SYSTEM_PROMPT = (
|
||||
"You are a financial document analysis assistant. "
|
||||
"Given the text extracted from a PDF document, return ONLY a JSON object "
|
||||
"with no markdown, no code fences, and no explanation."
|
||||
)
|
||||
|
||||
USER_PROMPT_TEMPLATE = """Analyze the following document text and return a JSON object with exactly these keys:
|
||||
document_type (one of: invoice, bill, receipt, order, expense, revenue, unknown),
|
||||
total_amount (string or null),
|
||||
currency (string or null),
|
||||
vendor_name (string or null),
|
||||
customer_name (string or null),
|
||||
billing_address (string or null),
|
||||
customer_address (string or null),
|
||||
invoice_number (string or null),
|
||||
invoice_date (string or null),
|
||||
due_date (string or null),
|
||||
tags (array of strings),
|
||||
line_items (array of objects, each with keys: description, amount).
|
||||
|
||||
Document text:
|
||||
{text}"""
|
||||
|
||||
|
||||
class AIProvider(ABC):
|
||||
@abstractmethod
|
||||
async def classify_document(self, text: str) -> dict:
|
||||
"""Return structured extraction dict from document text."""
|
||||
...
|
||||
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
OpenAI-compatible provider for Ollama and LM Studio.
|
||||
Both expose an OpenAI-compatible /v1/chat/completions endpoint.
|
||||
"""
|
||||
import json
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
|
||||
|
||||
|
||||
class OpenAICompatProvider(AIProvider):
|
||||
def __init__(self, config: dict) -> None:
|
||||
self._client = AsyncOpenAI(
|
||||
base_url=config["base_url"],
|
||||
api_key=config.get("api_key", "not-required"),
|
||||
)
|
||||
self._model = config["model"]
|
||||
|
||||
async def classify_document(self, text: str) -> dict:
|
||||
response = await self._client.chat.completions.create(
|
||||
model=self._model,
|
||||
temperature=0,
|
||||
messages=[
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000])},
|
||||
],
|
||||
)
|
||||
raw = response.choices[0].message.content.strip()
|
||||
return _parse_json(raw)
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> dict:
|
||||
if raw.startswith("```"):
|
||||
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
|
||||
return json.loads(raw)
|
||||
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
Reads doc_service_config.json from the shared config volume.
|
||||
Caches the result for 30 seconds to avoid hitting the filesystem on every request.
|
||||
Uses asyncio.to_thread so the synchronous file read doesn't block the event loop.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
_DEFAULT_CONFIG: dict = {
|
||||
"ai": {
|
||||
"provider": "anthropic",
|
||||
"anthropic": {"api_key": "", "model": "claude-haiku-4-5-20251001"},
|
||||
"ollama": {"base_url": "http://localhost:11434/v1", "model": "llama3.2", "api_key": "ollama"},
|
||||
"lmstudio": {"base_url": "http://localhost:1234/v1", "model": "local-model", "api_key": ""},
|
||||
},
|
||||
"documents": {"max_pdf_bytes": 20 * 1024 * 1024},
|
||||
}
|
||||
|
||||
_cache: dict | None = None
|
||||
_cache_at: float = 0.0
|
||||
_CACHE_TTL = 30.0
|
||||
|
||||
|
||||
def _read_config_sync() -> dict:
|
||||
path = Path(settings.CONFIG_PATH)
|
||||
if not path.exists():
|
||||
return _DEFAULT_CONFIG.copy()
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
async def load_doc_config() -> dict:
|
||||
global _cache, _cache_at
|
||||
now = time.monotonic()
|
||||
if _cache is not None and (now - _cache_at) < _CACHE_TTL:
|
||||
return _cache
|
||||
data = await asyncio.to_thread(_read_config_sync)
|
||||
_cache = data
|
||||
_cache_at = now
|
||||
return data
|
||||
@@ -0,0 +1,27 @@
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
|
||||
def get_upload_path(user_id: str, doc_id: str) -> Path:
|
||||
"""Return /data/documents/{user_id}/{doc_id}.pdf, creating the directory if needed."""
|
||||
user_dir = Path(settings.DATA_DIR) / user_id
|
||||
user_dir.mkdir(parents=True, exist_ok=True)
|
||||
return user_dir / f"{doc_id}.pdf"
|
||||
|
||||
|
||||
async def save_upload(file_data: bytes, user_id: str, doc_id: str) -> Path:
|
||||
dest = get_upload_path(user_id, doc_id)
|
||||
async with aiofiles.open(dest, "wb") as f:
|
||||
await f.write(file_data)
|
||||
return dest
|
||||
|
||||
|
||||
def delete_file(file_path: str) -> None:
|
||||
try:
|
||||
Path(file_path).unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass # log but do not raise — deletion failure must not 500
|
||||
@@ -0,0 +1,35 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=45"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "doc-service"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"fastapi>=0.111",
|
||||
"uvicorn[standard]>=0.29",
|
||||
"sqlalchemy[asyncio]>=2.0",
|
||||
"asyncpg>=0.29",
|
||||
"alembic>=1.13",
|
||||
"pydantic-settings>=2.2",
|
||||
"anthropic>=0.28",
|
||||
"openai>=1.0",
|
||||
"pdfplumber>=0.11",
|
||||
"aiofiles>=23.0",
|
||||
"python-multipart>=0.0.9",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=8",
|
||||
"pytest-asyncio>=0.23",
|
||||
"httpx>=0.27",
|
||||
"ruff>=0.4",
|
||||
]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
@@ -0,0 +1,8 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
echo "[doc-service] running migrations..."
|
||||
alembic upgrade head
|
||||
|
||||
echo "[doc-service] starting uvicorn..."
|
||||
exec uvicorn app.main:app --host 0.0.0.0 --port 8001
|
||||
@@ -0,0 +1,8 @@
|
||||
#!/bin/sh
|
||||
set -e
|
||||
|
||||
echo "[doc-service] running migrations..."
|
||||
alembic upgrade head
|
||||
|
||||
echo "[doc-service] starting uvicorn (dev)..."
|
||||
exec uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload
|
||||
Reference in New Issue
Block a user