Add PDF document service with AI extraction and per-app settings

- New `features/doc-service` FastAPI microservice: PDF upload, async
  text extraction (pdfplumber), AI classification via Anthropic/Ollama/
  LM Studio, per-user categories, file download
- Alembic migration isolated with `alembic_version_doc_service` table
- Main backend: httpx proxy routers for /api/documents/* and
  /api/documents/categories/*, admin settings API at /api/settings/*
- Runtime config in /config/doc_service_config.json (shared Docker
  volume); api_key masking on reads; atomic write with os.replace()
- Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage
  launcher hub, simplified Nav (removed Settings link), new routes
- docker-compose: doc-service service, doc_data + app_config volumes,
  removed internal:true from backend-net for outbound AI API calls
- Fix pre-commit hook: probe Docker socket path so git subprocess picks
  up Docker Desktop on macOS
- Fix security_check.py: use sys.executable for bandit so venv python
  is used instead of system python

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-14 05:28:11 +02:00
parent d423bea134
commit 0d34867a69
52 changed files with 2500 additions and 28 deletions
+3
View File
@@ -0,0 +1,3 @@
DATABASE_URL=postgresql+asyncpg://postgres:password@db:5432/destroying_sap
DATA_DIR=/data/documents
CONFIG_PATH=/config/doc_service_config.json
+34
View File
@@ -0,0 +1,34 @@
# ── Stage 1: dependency installation ─────────────────────────────────────────
FROM python:3.12-slim AS builder
WORKDIR /app
RUN pip install --upgrade pip
COPY pyproject.toml .
RUN pip install --prefix=/install .
# ── Stage 2: runtime ──────────────────────────────────────────────────────────
FROM python:3.12-slim
# Create non-root user (UID/GID 1001)
RUN groupadd --gid 1001 appuser && \
useradd --uid 1001 --gid 1001 --no-create-home --shell /bin/sh appuser
# Pre-create data and config dirs with correct ownership.
# Named volumes mounted over these paths will inherit ownership on first creation.
RUN mkdir -p /data/documents /config && chown -R appuser:appuser /data /config
WORKDIR /app
COPY --from=builder /install /usr/local
COPY --chown=appuser:appuser app ./app
COPY --chown=appuser:appuser alembic ./alembic
COPY --chown=appuser:appuser alembic.ini .
COPY --chown=appuser:appuser scripts ./scripts
USER appuser
EXPOSE 8001
CMD ["sh", "scripts/start.sh"]
+45
View File
@@ -0,0 +1,45 @@
[alembic]
script_location = alembic
prepend_sys_path = .
version_path_separator = os
sqlalchemy.url = postgresql+asyncpg://postgres:password@localhost:5432/destroying_sap
# Use a separate version table so this service's migrations don't collide
# with the main backend's alembic_version table in the shared postgres instance.
version_table = alembic_version_doc_service
[post_write_hooks]
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
+55
View File
@@ -0,0 +1,55 @@
import asyncio
from logging.config import fileConfig
from alembic import context
from sqlalchemy.ext.asyncio import create_async_engine
from app.core.config import settings
from app.database import Base
import app.models # noqa: F401 — registers Document, DocumentCategory, CategoryAssignment
config = context.config
config.set_main_option("sqlalchemy.url", settings.DATABASE_URL)
if config.config_file_name:
fileConfig(config.config_file_name)
target_metadata = Base.metadata
# Separate version table — must not collide with the main backend's alembic_version table.
VERSION_TABLE = "alembic_version_doc_service"
def run_migrations_offline():
context.configure(
url=settings.DATABASE_URL,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
version_table=VERSION_TABLE,
)
with context.begin_transaction():
context.run_migrations()
def do_run_migrations(connection):
context.configure(
connection=connection,
target_metadata=target_metadata,
version_table=VERSION_TABLE,
)
with context.begin_transaction():
context.run_migrations()
async def run_migrations_online():
engine = create_async_engine(settings.DATABASE_URL)
async with engine.connect() as conn:
await conn.run_sync(do_run_migrations)
await engine.dispose()
if context.is_offline_mode():
run_migrations_offline()
else:
asyncio.run(run_migrations_online())
@@ -0,0 +1,25 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}
@@ -0,0 +1,79 @@
"""create document tables
Revision ID: 0001
Revises:
Create Date: 2026-04-14
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
revision: str = "0001"
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
op.create_table(
"documents",
sa.Column("id", sa.String(), primary_key=True),
sa.Column("user_id", sa.String(), nullable=False),
sa.Column("filename", sa.String(), nullable=False),
sa.Column("file_path", sa.String(), nullable=False),
sa.Column("file_size", sa.Integer(), nullable=False),
sa.Column("status", sa.String(), nullable=False),
sa.Column("document_type", sa.String(), nullable=True),
sa.Column("raw_text", sa.Text(), nullable=True),
sa.Column("extracted_data", sa.Text(), nullable=True),
sa.Column("tags", sa.Text(), nullable=True),
sa.Column("error_message", sa.String(500), nullable=True),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.text("now()"),
nullable=False,
),
sa.Column("processed_at", sa.DateTime(timezone=True), nullable=True),
)
op.create_index("ix_documents_user_id", "documents", ["user_id"])
op.create_table(
"document_categories",
sa.Column("id", sa.String(), primary_key=True),
sa.Column("user_id", sa.String(), nullable=False),
sa.Column("name", sa.String(128), nullable=False),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.text("now()"),
nullable=False,
),
)
op.create_index("ix_document_categories_user_id", "document_categories", ["user_id"])
op.create_table(
"document_category_assignments",
sa.Column(
"document_id",
sa.String(),
sa.ForeignKey("documents.id", ondelete="CASCADE"),
primary_key=True,
),
sa.Column(
"category_id",
sa.String(),
sa.ForeignKey("document_categories.id", ondelete="CASCADE"),
primary_key=True,
),
)
def downgrade() -> None:
op.drop_table("document_category_assignments")
op.drop_index("ix_document_categories_user_id", "document_categories")
op.drop_table("document_categories")
op.drop_index("ix_documents_user_id", "documents")
op.drop_table("documents")
+14
View File
@@ -0,0 +1,14 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
PROJECT_NAME: str = "doc-service"
DATABASE_URL: str = "postgresql+asyncpg://postgres:password@db:5432/destroying_sap"
DATA_DIR: str = "/data/documents"
CONFIG_PATH: str = "/config/doc_service_config.json"
class Config:
env_file = ".env"
settings = Settings()
+16
View File
@@ -0,0 +1,16 @@
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase
from app.core.config import settings
engine = create_async_engine(settings.DATABASE_URL, echo=False)
AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False)
class Base(DeclarativeBase):
pass
async def get_db() -> AsyncSession:
async with AsyncSessionLocal() as session:
yield session
+12
View File
@@ -0,0 +1,12 @@
from fastapi import Header, HTTPException
async def get_user_id(x_user_id: str = Header(...)) -> str:
"""
Extract the user identity injected by the main backend proxy.
The main backend validates the JWT and forwards the user ID via this header.
Doc-service trusts it because it is only reachable from backend on backend-net.
"""
if not x_user_id:
raise HTTPException(status_code=400, detail="Missing X-User-Id header")
return x_user_id
+17
View File
@@ -0,0 +1,17 @@
from fastapi import FastAPI
from app.core.config import settings
from app.routers import categories, documents
app = FastAPI(title=settings.PROJECT_NAME)
# No CORS — this service is only reachable from the main backend on backend-net.
# All browser traffic goes through the main backend proxy.
app.include_router(documents.router, prefix="/documents", tags=["documents"])
app.include_router(categories.router, prefix="/categories", tags=["categories"])
@app.get("/health")
def health():
return {"status": "ok"}
@@ -0,0 +1,5 @@
from app.models.document import Document
from app.models.category import DocumentCategory
from app.models.category_assignment import CategoryAssignment
__all__ = ["Document", "DocumentCategory", "CategoryAssignment"]
@@ -0,0 +1,22 @@
import uuid
from datetime import datetime
from sqlalchemy import DateTime, String, func
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base
class DocumentCategory(Base):
__tablename__ = "document_categories"
id: Mapped[str] = mapped_column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
user_id: Mapped[str] = mapped_column(String, nullable=False, index=True)
name: Mapped[str] = mapped_column(String(128), nullable=False)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
assignments: Mapped[list["CategoryAssignment"]] = relationship(
"CategoryAssignment", back_populates="category", cascade="all, delete-orphan"
)
@@ -0,0 +1,20 @@
from sqlalchemy import ForeignKey, String
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base
class CategoryAssignment(Base):
__tablename__ = "document_category_assignments"
document_id: Mapped[str] = mapped_column(
String, ForeignKey("documents.id", ondelete="CASCADE"), primary_key=True
)
category_id: Mapped[str] = mapped_column(
String, ForeignKey("document_categories.id", ondelete="CASCADE"), primary_key=True
)
document: Mapped["Document"] = relationship("Document", back_populates="category_assignments")
category: Mapped["DocumentCategory"] = relationship(
"DocumentCategory", back_populates="assignments"
)
@@ -0,0 +1,31 @@
import uuid
from datetime import datetime
from sqlalchemy import DateTime, Integer, String, Text, func
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base
class Document(Base):
__tablename__ = "documents"
id: Mapped[str] = mapped_column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
user_id: Mapped[str] = mapped_column(String, nullable=False, index=True)
filename: Mapped[str] = mapped_column(String, nullable=False)
file_path: Mapped[str] = mapped_column(String, nullable=False)
file_size: Mapped[int] = mapped_column(Integer, nullable=False)
status: Mapped[str] = mapped_column(String, nullable=False, default="pending")
document_type: Mapped[str | None] = mapped_column(String, nullable=True)
raw_text: Mapped[str | None] = mapped_column(Text, nullable=True)
extracted_data: Mapped[str | None] = mapped_column(Text, nullable=True) # JSON string
tags: Mapped[str | None] = mapped_column(Text, nullable=True) # JSON array string
error_message: Mapped[str | None] = mapped_column(String(500), nullable=True)
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
category_assignments: Mapped[list["CategoryAssignment"]] = relationship(
"CategoryAssignment", back_populates="document", cascade="all, delete-orphan"
)
@@ -0,0 +1,80 @@
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import get_db
from app.deps import get_user_id
from app.models.category import DocumentCategory
from app.schemas.category import CategoryCreate, CategoryOut, CategoryUpdate
router = APIRouter()
@router.get("", response_model=list[CategoryOut])
async def list_categories(
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> list[DocumentCategory]:
result = await db.execute(
select(DocumentCategory)
.where(DocumentCategory.user_id == user_id)
.order_by(DocumentCategory.name)
)
return result.scalars().all()
@router.post("", response_model=CategoryOut, status_code=201)
async def create_category(
body: CategoryCreate,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> DocumentCategory:
name = body.name.strip()
if not name:
raise HTTPException(status_code=422, detail="Category name cannot be empty")
cat = DocumentCategory(user_id=user_id, name=name[:128])
db.add(cat)
await db.commit()
await db.refresh(cat)
return cat
@router.patch("/{cat_id}", response_model=CategoryOut)
async def rename_category(
cat_id: str,
body: CategoryUpdate,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> DocumentCategory:
cat = await _get_user_cat(cat_id, user_id, db)
name = body.name.strip()
if not name:
raise HTTPException(status_code=422, detail="Category name cannot be empty")
cat.name = name[:128]
await db.commit()
await db.refresh(cat)
return cat
@router.delete("/{cat_id}", status_code=204)
async def delete_category(
cat_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
cat = await _get_user_cat(cat_id, user_id, db)
await db.delete(cat)
await db.commit()
async def _get_user_cat(cat_id: str, user_id: str, db: AsyncSession) -> DocumentCategory:
result = await db.execute(
select(DocumentCategory).where(
DocumentCategory.id == cat_id,
DocumentCategory.user_id == user_id,
)
)
cat = result.scalar_one_or_none()
if cat is None:
raise HTTPException(status_code=404, detail="Category not found")
return cat
@@ -0,0 +1,304 @@
import asyncio
import json
import uuid
from datetime import datetime, timezone
import aiofiles
import pdfplumber
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.database import AsyncSessionLocal, get_db
from app.deps import get_user_id
from app.models.category import DocumentCategory
from app.models.category_assignment import CategoryAssignment
from app.models.document import Document
from app.schemas.document import DocumentOut, DocumentStatusOut, DocumentTypeUpdate
from app.services.ai import get_provider
from app.services.config_reader import load_doc_config
from app.services.storage import delete_file, get_upload_path, save_upload
router = APIRouter()
_DEFAULT_MAX_BYTES = 20 * 1024 * 1024
# ── Helpers ───────────────────────────────────────────────────────────────────
async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
result = await db.execute(
select(Document)
.where(Document.id == doc_id, Document.user_id == user_id)
.options(
selectinload(Document.category_assignments)
.selectinload(CategoryAssignment.category)
)
)
doc = result.scalar_one_or_none()
if doc is None:
raise HTTPException(status_code=404, detail="Document not found")
return doc
def _doc_with_categories(doc: Document) -> DocumentOut:
from app.schemas.document import CategoryOut
cats = [CategoryOut(id=a.category.id, name=a.category.name) for a in doc.category_assignments]
return DocumentOut(
id=doc.id,
user_id=doc.user_id,
filename=doc.filename,
file_size=doc.file_size,
status=doc.status,
document_type=doc.document_type,
extracted_data=doc.extracted_data,
tags=doc.tags,
error_message=doc.error_message,
created_at=doc.created_at,
processed_at=doc.processed_at,
categories=cats,
)
def _extract_pdf_text(file_path: str) -> str:
"""Synchronous — must be called via asyncio.to_thread."""
text_parts = []
with pdfplumber.open(file_path) as pdf:
for page in pdf.pages:
page_text = page.extract_text()
if page_text:
text_parts.append(page_text)
return "\n".join(text_parts)
# ── Background processing ─────────────────────────────────────────────────────
async def process_document(doc_id: str) -> None:
"""
Runs after the upload response is sent.
Opens its own DB session — never use the request's Depends session here.
Loads AI config fresh from the config file so settings changes apply without restart.
"""
async with AsyncSessionLocal() as db:
doc = await db.get(Document, doc_id)
if doc is None:
return
doc.status = "processing"
await db.commit()
try:
text = await asyncio.to_thread(_extract_pdf_text, doc.file_path)
config = await load_doc_config()
provider = get_provider(config["ai"])
result = await provider.classify_document(text)
doc.raw_text = text[:500_000] # cap stored text at 500k chars
doc.extracted_data = json.dumps(result)
doc.document_type = result.get("document_type", "unknown")
doc.tags = json.dumps(result.get("tags", []))
doc.status = "done"
doc.processed_at = datetime.now(timezone.utc)
except Exception as exc:
doc.status = "failed"
doc.error_message = str(exc)[:500]
await db.commit()
# ── Routes ────────────────────────────────────────────────────────────────────
@router.post("/upload", response_model=DocumentOut, status_code=202)
async def upload_document(
file: UploadFile,
background_tasks: BackgroundTasks,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> DocumentOut:
if file.content_type not in ("application/pdf", "application/octet-stream"):
if not (file.filename or "").lower().endswith(".pdf"):
raise HTTPException(status_code=415, detail="Only PDF files are accepted")
config = await load_doc_config()
max_bytes = config.get("documents", {}).get("max_pdf_bytes", _DEFAULT_MAX_BYTES)
file_data = await file.read()
if len(file_data) > max_bytes:
raise HTTPException(
status_code=413,
detail=f"File exceeds maximum size of {max_bytes // (1024*1024)} MB",
)
doc_id = str(uuid.uuid4())
dest = await save_upload(file_data, user_id, doc_id)
doc = Document(
id=doc_id,
user_id=user_id,
filename=file.filename or "upload.pdf",
file_path=str(dest),
file_size=len(file_data),
status="pending",
)
db.add(doc)
await db.commit()
await db.refresh(doc)
background_tasks.add_task(process_document, doc_id)
return _doc_with_categories(doc)
@router.get("", response_model=list[DocumentOut])
async def list_documents(
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> list[DocumentOut]:
result = await db.execute(
select(Document)
.where(Document.user_id == user_id)
.options(
selectinload(Document.category_assignments)
.selectinload(CategoryAssignment.category)
)
.order_by(Document.created_at.desc())
)
return [_doc_with_categories(d) for d in result.scalars().all()]
@router.get("/{doc_id}", response_model=DocumentOut)
async def get_document(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> DocumentOut:
doc = await _get_user_doc(doc_id, user_id, db)
return _doc_with_categories(doc)
@router.get("/{doc_id}/status", response_model=DocumentStatusOut)
async def get_document_status(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> Document:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
)
doc = result.scalar_one_or_none()
if doc is None:
raise HTTPException(status_code=404, detail="Document not found")
return doc
@router.patch("/{doc_id}/type", response_model=DocumentOut)
async def update_document_type(
doc_id: str,
body: DocumentTypeUpdate,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> DocumentOut:
doc = await _get_user_doc(doc_id, user_id, db)
doc.document_type = body.document_type
await db.commit()
await db.refresh(doc)
return _doc_with_categories(doc)
@router.delete("/{doc_id}", status_code=204)
async def delete_document(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
)
doc = result.scalar_one_or_none()
if doc is None:
raise HTTPException(status_code=404, detail="Document not found")
delete_file(doc.file_path)
await db.delete(doc)
await db.commit()
@router.get("/{doc_id}/file")
async def download_file(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> StreamingResponse:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
)
doc = result.scalar_one_or_none()
if doc is None:
raise HTTPException(status_code=404, detail="Document not found")
async def file_generator():
async with aiofiles.open(doc.file_path, "rb") as f:
while chunk := await f.read(64 * 1024):
yield chunk
return StreamingResponse(
file_generator(),
media_type="application/pdf",
headers={"Content-Disposition": f'inline; filename="{doc.filename}"'},
)
# ── Category assignment ───────────────────────────────────────────────────────
@router.post("/{doc_id}/categories/{cat_id}", status_code=204)
async def assign_category(
doc_id: str,
cat_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
# Verify both belong to this user
doc_result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
)
if doc_result.scalar_one_or_none() is None:
raise HTTPException(status_code=404, detail="Document not found")
cat_result = await db.execute(
select(DocumentCategory).where(
DocumentCategory.id == cat_id, DocumentCategory.user_id == user_id
)
)
if cat_result.scalar_one_or_none() is None:
raise HTTPException(status_code=404, detail="Category not found")
# Upsert — ignore if already assigned
existing = await db.execute(
select(CategoryAssignment).where(
CategoryAssignment.document_id == doc_id,
CategoryAssignment.category_id == cat_id,
)
)
if existing.scalar_one_or_none() is None:
db.add(CategoryAssignment(document_id=doc_id, category_id=cat_id))
await db.commit()
@router.delete("/{doc_id}/categories/{cat_id}", status_code=204)
async def remove_category(
doc_id: str,
cat_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
result = await db.execute(
select(CategoryAssignment).where(
CategoryAssignment.document_id == doc_id,
CategoryAssignment.category_id == cat_id,
)
)
assignment = result.scalar_one_or_none()
if assignment:
await db.delete(assignment)
await db.commit()
@@ -0,0 +1,20 @@
from datetime import datetime
from pydantic import BaseModel
class CategoryOut(BaseModel):
id: str
user_id: str
name: str
created_at: datetime
model_config = {"from_attributes": True}
class CategoryCreate(BaseModel):
name: str
class CategoryUpdate(BaseModel):
name: str
@@ -0,0 +1,39 @@
from datetime import datetime
from pydantic import BaseModel
class CategoryOut(BaseModel):
id: str
name: str
model_config = {"from_attributes": True}
class DocumentOut(BaseModel):
id: str
user_id: str
filename: str
file_size: int
status: str
document_type: str | None
extracted_data: str | None # JSON string — frontend calls JSON.parse()
tags: str | None # JSON array string
error_message: str | None
created_at: datetime
processed_at: datetime | None
categories: list[CategoryOut] = []
model_config = {"from_attributes": True}
class DocumentStatusOut(BaseModel):
id: str
status: str
error_message: str | None
processed_at: datetime | None
model_config = {"from_attributes": True}
class DocumentTypeUpdate(BaseModel):
document_type: str
@@ -0,0 +1,23 @@
from app.services.ai.base import AIProvider
def get_provider(ai_config: dict) -> AIProvider:
"""
Factory: return an AIProvider instance based on the 'provider' key in the AI config section.
ai_config is the 'ai' section of doc_service_config.json, loaded fresh per processing job.
"""
provider_name = ai_config.get("provider", "anthropic")
provider_cfg = ai_config.get(provider_name, {})
match provider_name:
case "anthropic":
from app.services.ai.anthropic_provider import AnthropicProvider
return AnthropicProvider(provider_cfg)
case "ollama" | "lmstudio":
from app.services.ai.openai_compat import OpenAICompatProvider
return OpenAICompatProvider(provider_cfg)
case _:
raise ValueError(f"Unknown AI provider: {provider_name!r}")
__all__ = ["AIProvider", "get_provider"]
@@ -0,0 +1,31 @@
import json
from anthropic import AsyncAnthropic
from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
class AnthropicProvider(AIProvider):
def __init__(self, config: dict) -> None:
self._client = AsyncAnthropic(api_key=config["api_key"])
self._model = config.get("model", "claude-haiku-4-5-20251001")
async def classify_document(self, text: str) -> dict:
message = await self._client.messages.create(
model=self._model,
max_tokens=2048,
system=SYSTEM_PROMPT,
messages=[{
"role": "user",
"content": USER_PROMPT_TEMPLATE.format(text=text[:100_000]),
}],
)
raw = message.content[0].text.strip()
return _parse_json(raw)
def _parse_json(raw: str) -> dict:
# Strip accidental markdown fences despite explicit instruction not to include them
if raw.startswith("```"):
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
return json.loads(raw)
@@ -0,0 +1,31 @@
from abc import ABC, abstractmethod
SYSTEM_PROMPT = (
"You are a financial document analysis assistant. "
"Given the text extracted from a PDF document, return ONLY a JSON object "
"with no markdown, no code fences, and no explanation."
)
USER_PROMPT_TEMPLATE = """Analyze the following document text and return a JSON object with exactly these keys:
document_type (one of: invoice, bill, receipt, order, expense, revenue, unknown),
total_amount (string or null),
currency (string or null),
vendor_name (string or null),
customer_name (string or null),
billing_address (string or null),
customer_address (string or null),
invoice_number (string or null),
invoice_date (string or null),
due_date (string or null),
tags (array of strings),
line_items (array of objects, each with keys: description, amount).
Document text:
{text}"""
class AIProvider(ABC):
@abstractmethod
async def classify_document(self, text: str) -> dict:
"""Return structured extraction dict from document text."""
...
@@ -0,0 +1,36 @@
"""
OpenAI-compatible provider for Ollama and LM Studio.
Both expose an OpenAI-compatible /v1/chat/completions endpoint.
"""
import json
from openai import AsyncOpenAI
from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
class OpenAICompatProvider(AIProvider):
def __init__(self, config: dict) -> None:
self._client = AsyncOpenAI(
base_url=config["base_url"],
api_key=config.get("api_key", "not-required"),
)
self._model = config["model"]
async def classify_document(self, text: str) -> dict:
response = await self._client.chat.completions.create(
model=self._model,
temperature=0,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000])},
],
)
raw = response.choices[0].message.content.strip()
return _parse_json(raw)
def _parse_json(raw: str) -> dict:
if raw.startswith("```"):
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
return json.loads(raw)
@@ -0,0 +1,44 @@
"""
Reads doc_service_config.json from the shared config volume.
Caches the result for 30 seconds to avoid hitting the filesystem on every request.
Uses asyncio.to_thread so the synchronous file read doesn't block the event loop.
"""
import asyncio
import json
import time
from pathlib import Path
from app.core.config import settings
_DEFAULT_CONFIG: dict = {
"ai": {
"provider": "anthropic",
"anthropic": {"api_key": "", "model": "claude-haiku-4-5-20251001"},
"ollama": {"base_url": "http://localhost:11434/v1", "model": "llama3.2", "api_key": "ollama"},
"lmstudio": {"base_url": "http://localhost:1234/v1", "model": "local-model", "api_key": ""},
},
"documents": {"max_pdf_bytes": 20 * 1024 * 1024},
}
_cache: dict | None = None
_cache_at: float = 0.0
_CACHE_TTL = 30.0
def _read_config_sync() -> dict:
path = Path(settings.CONFIG_PATH)
if not path.exists():
return _DEFAULT_CONFIG.copy()
with open(path) as f:
return json.load(f)
async def load_doc_config() -> dict:
global _cache, _cache_at
now = time.monotonic()
if _cache is not None and (now - _cache_at) < _CACHE_TTL:
return _cache
data = await asyncio.to_thread(_read_config_sync)
_cache = data
_cache_at = now
return data
@@ -0,0 +1,27 @@
import asyncio
from pathlib import Path
import aiofiles
from app.core.config import settings
def get_upload_path(user_id: str, doc_id: str) -> Path:
"""Return /data/documents/{user_id}/{doc_id}.pdf, creating the directory if needed."""
user_dir = Path(settings.DATA_DIR) / user_id
user_dir.mkdir(parents=True, exist_ok=True)
return user_dir / f"{doc_id}.pdf"
async def save_upload(file_data: bytes, user_id: str, doc_id: str) -> Path:
dest = get_upload_path(user_id, doc_id)
async with aiofiles.open(dest, "wb") as f:
await f.write(file_data)
return dest
def delete_file(file_path: str) -> None:
try:
Path(file_path).unlink(missing_ok=True)
except OSError:
pass # log but do not raise — deletion failure must not 500
+35
View File
@@ -0,0 +1,35 @@
[build-system]
requires = ["setuptools>=45"]
build-backend = "setuptools.build_meta"
[project]
name = "doc-service"
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.111",
"uvicorn[standard]>=0.29",
"sqlalchemy[asyncio]>=2.0",
"asyncpg>=0.29",
"alembic>=1.13",
"pydantic-settings>=2.2",
"anthropic>=0.28",
"openai>=1.0",
"pdfplumber>=0.11",
"aiofiles>=23.0",
"python-multipart>=0.0.9",
]
[project.optional-dependencies]
dev = [
"pytest>=8",
"pytest-asyncio>=0.23",
"httpx>=0.27",
"ruff>=0.4",
]
[tool.pytest.ini_options]
asyncio_mode = "auto"
[tool.ruff]
line-length = 100
+8
View File
@@ -0,0 +1,8 @@
#!/bin/sh
set -e
echo "[doc-service] running migrations..."
alembic upgrade head
echo "[doc-service] starting uvicorn..."
exec uvicorn app.main:app --host 0.0.0.0 --port 8001
@@ -0,0 +1,8 @@
#!/bin/sh
set -e
echo "[doc-service] running migrations..."
alembic upgrade head
echo "[doc-service] starting uvicorn (dev)..."
exec uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload