# kite/backend/api/documents.py

from datetime import datetime, timezone
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query
from services import storage, extractor, classifier

# Every route declared on this router is served under the /api/documents
# prefix and carries the "documents" tag.
router = APIRouter(prefix="/api/documents", tags=["documents"])

# MIME types recognised for documents: PDF, Word (.docx / legacy .doc),
# plain text and Markdown, and common raster image formats.
# NOTE(review): "image/jpg" is not a registered media type (the registered
# name is "image/jpeg"); presumably it is listed to tolerate clients that
# send the non-standard spelling -- confirm before removing.
ALLOWED_MIME_TYPES = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/msword",
    "text/plain",
    "text/markdown",
    "image/png",
    "image/jpeg",
    "image/jpg",
    "image/tiff",
    "image/webp",
}


@router.post("/upload")
async def upload_document(
    file: UploadFile = File(...),
    auto_classify: bool = Form(True),
):
    """Store an uploaded file, extract its text and optionally classify it.

    Args:
        file: Multipart file upload to store.
        auto_classify: When true (the default), run the classifier on the
            document right after its metadata record has been saved.

    Returns:
        The metadata dict built for the document (id, names, MIME type, size,
        extracted text, topics and timestamps). If classification was
        requested and failed, the dict additionally carries a
        ``classification_error`` message and ``topics`` stays empty.

    Raises:
        HTTPException: 415 when the declared content type is not listed in
            ``ALLOWED_MIME_TYPES``; 400 when the uploaded file is empty.
    """
    # Strip media-type parameters (e.g. "; charset=utf-8") and normalise case
    # so the declared type can be compared with the bare allow-list entries.
    declared = file.content_type or "application/octet-stream"
    mime = declared.split(";", 1)[0].strip().lower()
    if mime not in ALLOWED_MIME_TYPES:
        # Reject before reading the body so unsupported uploads are never
        # stored or handed to the extractor.
        raise HTTPException(415, f"Unsupported file type: {mime}")

    content = await file.read()
    if not content:
        raise HTTPException(400, "Empty file")

    original_name = file.filename or "upload"
    saved = storage.save_upload(content, original_name, mime)
    text = extractor.extract_text(saved["path"], mime)

    now = datetime.now(timezone.utc).isoformat()
    meta = {
        "id": saved["id"],
        "original_name": original_name,
        "filename": saved["filename"],
        "mime_type": mime,
        "size_bytes": len(content),
        "extracted_text": text,
        "topics": [],
        "created_at": now,
        "classified_at": None,
    }
    # The record is saved before classification so the classifier can be
    # called with just the document id.
    storage.save_metadata(meta)

    if auto_classify:
        try:
            topics = await classifier.classify_document(saved["id"])
            # These updates only affect the response: this function does not
            # call storage.save_metadata again. Whether the classifier
            # persists the topics itself is not visible from here.
            meta["topics"] = topics
            meta["classified_at"] = datetime.now(timezone.utc).isoformat()
        except Exception as e:
            # Classification failure is non-fatal; document is still saved
            meta["classification_error"] = str(e)

    return meta


@router.get("")
async def list_documents(
    topic: str | None = Query(None),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Return one page of document metadata, optionally filtered by topic.

    Args:
        topic: Passed through to ``storage.list_metadata`` as the filter;
            ``None`` when the query parameter is omitted.
        page: 1-based page number.
        per_page: Page size, between 1 and 100.

    Returns:
        A dict with the page's ``items``, the ``total`` number of matching
        documents, and the ``page`` / ``per_page`` values that were used.
    """
    matching = storage.list_metadata(topic=topic)
    match_count = len(matching)
    # Pages are 1-based, so page 1 starts at offset 0.
    offset = per_page * (page - 1)
    window = matching[offset : offset + per_page]
    return {
        "items": window,
        "total": match_count,
        "page": page,
        "per_page": per_page,
    }


@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return the stored metadata for ``doc_id``.

    Raises:
        HTTPException: 404 when storage has no metadata for that id.
    """
    record = storage.get_metadata(doc_id)
    if record is not None:
        return record
    raise HTTPException(404, "Document not found")


@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete the document identified by ``doc_id``.

    Returns:
        ``{"success": True}`` when storage reports a truthy result.

    Raises:
        HTTPException: 404 when ``storage.delete_document`` returns a falsy
            result.
    """
    removed = storage.delete_document(doc_id)
    if removed:
        return {"success": True}
    raise HTTPException(404, "Document not found")


@router.post("/{doc_id}/classify")
async def classify_document(doc_id: str, body: dict | None = None):
    """Run (or re-run) classification for an existing document.

    Args:
        doc_id: Identifier of the document to classify.
        body: Optional JSON object. When it has a ``"topics"`` key, that value
            is forwarded to the classifier as its second argument; otherwise
            ``None`` is forwarded.

    Returns:
        ``{"topics": ...}`` holding whatever the classifier returned.

    Raises:
        HTTPException: 404 when storage has no metadata for ``doc_id``;
            500 when the classifier raises.
    """
    # Only existence matters here; the classifier is called with the id.
    if storage.get_metadata(doc_id) is None:
        raise HTTPException(404, "Document not found")

    # A missing or empty body means "no explicit topic list".
    topic_names = body.get("topics") if body else None
    try:
        topics = await classifier.classify_document(doc_id, topic_names)
    except Exception as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(500, f"Classification failed: {e}") from e

    return {"topics": topics}