1882edfff6
- backend/api/auth.py: register, login (TOTP+backup), refresh, logout, me, change-password; per-account Redis rate limit; HIBP check - backend/main.py: Origin validation middleware, CSP headers middleware, CORS locked to settings.cors_origins, Redis lifespan (app.state.redis), admin bootstrap, auth router included, slowapi SlowAPIMiddleware - backend/services/email.py: already created in Plan 01 (verified exists) - Python 3.9 compat: fixed match statement in ai/__init__.py, str|None union syntax in openai_provider.py, api/documents.py, api/topics.py, api/settings.py, services/classifier.py All 17 tests in test_auth_api.py pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
119 lines
3.9 KiB
Python
119 lines
3.9 KiB
Python
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from deps.db import get_db
|
|
from services import classifier, extractor, storage
|
|
from tasks.document_tasks import extract_and_classify
|
|
|
|
router = APIRouter(prefix="/api/documents", tags=["documents"])
|
|
|
|
ALLOWED_MIME_TYPES = {
|
|
"application/pdf",
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
"application/msword",
|
|
"text/plain",
|
|
"text/markdown",
|
|
"image/png",
|
|
"image/jpeg",
|
|
"image/jpg",
|
|
"image/tiff",
|
|
"image/webp",
|
|
}
|
|
|
|
|
|
@router.post("/upload")
|
|
async def upload_document(
|
|
file: UploadFile = File(...),
|
|
auto_classify: bool = Form(True),
|
|
session: AsyncSession = Depends(get_db),
|
|
):
|
|
content = await file.read()
|
|
if len(content) == 0:
|
|
raise HTTPException(400, "Empty file")
|
|
|
|
mime = file.content_type or "application/octet-stream"
|
|
|
|
saved = await storage.save_upload(session, content, file.filename or "upload", mime)
|
|
|
|
# Extract text from the in-memory bytes (avoid a second MinIO round-trip at upload time)
|
|
text = extractor.extract_text_from_bytes(content, mime)
|
|
|
|
now = datetime.now(timezone.utc).isoformat()
|
|
meta = {
|
|
"id": saved["id"],
|
|
"original_name": file.filename or "upload",
|
|
"filename": saved["filename"],
|
|
"mime_type": mime,
|
|
"size_bytes": len(content),
|
|
"extracted_text": text,
|
|
# Phase 1 cutover: topics are empty at upload time; the Celery worker
|
|
# fills them in asynchronously after extract_and_classify completes.
|
|
"topics": [],
|
|
"created_at": now,
|
|
"classified_at": None,
|
|
}
|
|
await storage.save_metadata(session, meta)
|
|
|
|
if auto_classify:
|
|
# Queue the extract+classify task on the Celery documents queue (STORE-08).
|
|
# The task re-fetches bytes from MinIO, extracts text, and classifies.
|
|
# The upload response returns topics=[] — polling GET /api/documents/{id}
|
|
# will show the populated topics once the worker completes.
|
|
extract_and_classify.delay(str(saved["id"]))
|
|
|
|
return meta
|
|
|
|
|
|
@router.get("")
|
|
async def list_documents(
|
|
topic: Optional[str] = Query(None),
|
|
page: int = Query(1, ge=1),
|
|
per_page: int = Query(20, ge=1, le=100),
|
|
session: AsyncSession = Depends(get_db),
|
|
):
|
|
docs = await storage.list_metadata(session, topic=topic)
|
|
total = len(docs)
|
|
start = (page - 1) * per_page
|
|
return {"items": docs[start : start + per_page], "total": total, "page": page, "per_page": per_page}
|
|
|
|
|
|
@router.get("/{doc_id}")
|
|
async def get_document(doc_id: str, session: AsyncSession = Depends(get_db)):
|
|
meta = await storage.get_metadata(session, doc_id)
|
|
if meta is None:
|
|
raise HTTPException(404, "Document not found")
|
|
return meta
|
|
|
|
|
|
@router.delete("/{doc_id}")
|
|
async def delete_document(doc_id: str, session: AsyncSession = Depends(get_db)):
|
|
ok = await storage.delete_document(session, doc_id)
|
|
if not ok:
|
|
raise HTTPException(404, "Document not found")
|
|
return {"success": True}
|
|
|
|
|
|
@router.post("/{doc_id}/classify")
|
|
async def classify_document(
|
|
doc_id: str,
|
|
body: dict = {},
|
|
session: AsyncSession = Depends(get_db),
|
|
):
|
|
meta = await storage.get_metadata(session, doc_id)
|
|
if meta is None:
|
|
raise HTTPException(404, "Document not found")
|
|
|
|
topic_names = body.get("topics") if body else None
|
|
try:
|
|
# The /classify endpoint calls classifier synchronously and returns the
|
|
# topic list immediately — this preserves the historical behavior.
|
|
# The upload-time path uses Celery .delay() instead (Phase 1 cutover).
|
|
topics = await classifier.classify_document(session, doc_id, topic_names)
|
|
except Exception as e:
|
|
raise HTTPException(500, f"Classification failed: {e}")
|
|
|
|
return {"topics": topics}
|