from datetime import datetime, timezone

from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query

from services import storage, extractor, classifier

router = APIRouter(prefix="/api/documents", tags=["documents"])

# Media types the upload endpoint accepts; anything else is rejected with 415.
ALLOWED_MIME_TYPES = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/msword",
    "text/plain",
    "text/markdown",
    "image/png",
    "image/jpeg",
    "image/jpg",
    "image/tiff",
    "image/webp",
}


@router.post("/upload")
async def upload_document(
    file: UploadFile = File(...),
    auto_classify: bool = Form(True),
):
    """Store an uploaded file, extract its text and optionally classify it.

    Args:
        file: The uploaded file (multipart form field).
        auto_classify: When true, run the classifier right after the
            metadata has been saved.

    Returns:
        The metadata dict built for the document. If classification was
        requested and failed, the dict carries a ``classification_error``
        key and the document is still saved.

    Raises:
        HTTPException: 400 if the upload is empty, 415 if its content type
            is not in ``ALLOWED_MIME_TYPES``.
    """
    content = await file.read()
    if not content:
        raise HTTPException(400, "Empty file")

    mime = file.content_type or "application/octet-stream"
    # A Content-Type value may carry parameters ("text/plain; charset=utf-8")
    # and is case-insensitive, so compare only the normalized bare media type.
    media_type = mime.split(";", 1)[0].strip().lower()
    if media_type not in ALLOWED_MIME_TYPES:
        # Reject before anything is written to storage.
        raise HTTPException(415, f"Unsupported file type: {media_type}")

    original_name = file.filename or "upload"
    saved = storage.save_upload(content, original_name, mime)
    text = extractor.extract_text(saved["path"], mime)

    now = datetime.now(timezone.utc).isoformat()
    meta = {
        "id": saved["id"],
        "original_name": original_name,
        "filename": saved["filename"],
        "mime_type": mime,
        "size_bytes": len(content),
        "extracted_text": text,
        "topics": [],
        "created_at": now,
        "classified_at": None,
    }
    storage.save_metadata(meta)

    if auto_classify:
        # NOTE(review): the fields set below only update the response dict;
        # presumably the classifier persists its own result - confirm.
        try:
            topics = await classifier.classify_document(saved["id"])
            meta["topics"] = topics
            meta["classified_at"] = datetime.now(timezone.utc).isoformat()
        except Exception as e:
            # Classification failure is non-fatal; document is still saved
            meta["classification_error"] = str(e)

    return meta


@router.get("")
async def list_documents(
    topic: str | None = Query(None),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Return one page of document metadata.

    Args:
        topic: Optional filter, passed through to ``storage.list_metadata``.
        page: 1-based page number.
        per_page: Page size (1-100).

    Returns:
        A dict with the page's ``items``, the ``total`` number of documents
        returned by storage, and the echoed ``page`` / ``per_page``.
    """
    docs = storage.list_metadata(topic=topic)
    total = len(docs)
    start = (page - 1) * per_page
    return {
        "items": docs[start : start + per_page],
        "total": total,
        "page": page,
        "per_page": per_page,
    }


@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return the stored metadata for ``doc_id``.

    Raises:
        HTTPException: 404 if storage has no metadata for the id.
    """
    meta = storage.get_metadata(doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")
    return meta
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete the document identified by ``doc_id``.

    Returns:
        ``{"success": True}`` when storage reports the deletion succeeded.

    Raises:
        HTTPException: 404 if ``storage.delete_document`` returns a falsy
            value.
    """
    ok = storage.delete_document(doc_id)
    if not ok:
        raise HTTPException(404, "Document not found")
    return {"success": True}


@router.post("/{doc_id}/classify")
async def classify_document(doc_id: str, body: dict | None = None):
    """Run the classifier on an existing document.

    Args:
        doc_id: Identifier of the document to classify.
        body: Optional JSON object. Its ``"topics"`` value, if present, is
            forwarded to the classifier as the second argument; otherwise
            ``None`` is forwarded.

    Returns:
        ``{"topics": ...}`` with the classifier's result.

    Raises:
        HTTPException: 404 if the document does not exist, 500 if the
            classifier raises.
    """
    meta = storage.get_metadata(doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")

    # A missing or empty body means no caller-supplied topic list.
    topic_names = body.get("topics") if body else None
    try:
        topics = await classifier.classify_document(doc_id, topic_names)
    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(500, f"Classification failed: {e}") from e
    return {"topics": topics}