chore: initial commit — existing single-user document scanner codebase

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-22 08:53:28 +02:00
parent 6fed5ba531
commit 7a34807fa0
71 changed files with 16408 additions and 0 deletions
@@ -0,0 +1,101 @@
+from datetime import datetime, timezone
+from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query
+from services import storage, extractor, classifier
+
+# Router for the document endpoints; every route below is served under /api/documents.
+router = APIRouter(prefix="/api/documents", tags=["documents"])
+
+# MIME types of the supported document formats: PDF, Word (.docx / .doc),
+# plain text and Markdown, and common raster image formats.
+# NOTE(review): presumably intended as the upload allow-list — confirm where it is enforced.
+# NOTE(review): "image/jpg" is not a registered MIME type; presumably kept for
+# clients that send it instead of "image/jpeg" — confirm.
+ALLOWED_MIME_TYPES = {
+    "application/pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "application/msword",
+    "text/plain",
+    "text/markdown",
+    "image/png",
+    "image/jpeg",
+    "image/jpg",
+    "image/tiff",
+    "image/webp",
+}
+
+
+@router.post("/upload")
+async def upload_document(
+    file: UploadFile = File(...),
+    auto_classify: bool = Form(True),
+):
+    """Store an uploaded file, extract its text, and optionally classify it.
+
+    Args:
+        file: The multipart file upload.
+        auto_classify: When true (the default), run the classifier on the
+            stored document before responding.
+
+    Returns:
+        The metadata dict for the new document. If classification ran
+        successfully it carries the resulting ``topics`` and a
+        ``classified_at`` timestamp; if classification raised, it carries a
+        ``classification_error`` message instead.
+
+    Raises:
+        HTTPException: 400 if the upload is empty; 415 if its content type is
+            not listed in ``ALLOWED_MIME_TYPES``.
+    """
+    content = await file.read()
+    if not content:
+        raise HTTPException(400, "Empty file")
+
+    mime = file.content_type or "application/octet-stream"
+
+    # Enforce the allow-list before anything is written to storage. Compare
+    # only the bare media type so a header such as
+    # "text/plain; charset=utf-8" is still recognised.
+    media_type = mime.split(";", 1)[0].strip().lower()
+    if media_type not in ALLOWED_MIME_TYPES:
+        raise HTTPException(415, f"Unsupported file type: {mime}")
+
+    saved = storage.save_upload(content, file.filename or "upload", mime)
+    text = extractor.extract_text(saved["path"], mime)
+
+    now = datetime.now(timezone.utc).isoformat()
+    meta = {
+        "id": saved["id"],
+        "original_name": file.filename or "upload",
+        "filename": saved["filename"],
+        "mime_type": mime,
+        "size_bytes": len(content),
+        "extracted_text": text,
+        "topics": [],
+        "created_at": now,
+        "classified_at": None,
+    }
+    # Persist before classifying so the document survives a classifier failure.
+    storage.save_metadata(meta)
+
+    if auto_classify:
+        try:
+            topics = await classifier.classify_document(saved["id"])
+            # NOTE(review): these updates only touch the response dict; this
+            # handler does not call save_metadata again. Presumably the
+            # classifier persists its own result — confirm.
+            meta["topics"] = topics
+            meta["classified_at"] = datetime.now(timezone.utc).isoformat()
+        except Exception as e:
+            # Classification failure is non-fatal; document is still saved
+            meta["classification_error"] = str(e)
+
+    return meta
+
+
+@router.get("")
+async def list_documents(
+    topic: str | None = Query(None),
+    page: int = Query(1, ge=1),
+    per_page: int = Query(20, ge=1, le=100),
+):
+    """Return one page of document metadata, optionally filtered by topic.
+
+    Args:
+        topic: Passed through to ``storage.list_metadata`` as the filter.
+        page: 1-based page number.
+        per_page: Page size (1-100).
+
+    Returns:
+        A dict with the page's ``items``, the ``total`` number of matching
+        documents, and the echoed ``page`` / ``per_page`` values.
+    """
+    matching = storage.list_metadata(topic=topic)
+    offset = per_page * (page - 1)
+    page_items = matching[offset : offset + per_page]
+    return {
+        "items": page_items,
+        "total": len(matching),
+        "page": page,
+        "per_page": per_page,
+    }
+
+
+@router.get("/{doc_id}")
+async def get_document(doc_id: str):
+    """Return the stored metadata for ``doc_id``.
+
+    Raises:
+        HTTPException: 404 when storage has no metadata for the id.
+    """
+    if (record := storage.get_metadata(doc_id)) is not None:
+        return record
+    raise HTTPException(404, "Document not found")
+
+
+@router.delete("/{doc_id}")
+async def delete_document(doc_id: str):
+    """Delete the document identified by ``doc_id``.
+
+    Returns:
+        ``{"success": True}`` when storage reports the deletion succeeded.
+
+    Raises:
+        HTTPException: 404 when storage reports a falsy result.
+    """
+    if storage.delete_document(doc_id):
+        return {"success": True}
+    raise HTTPException(404, "Document not found")
+
+
+@router.post("/{doc_id}/classify")
+async def classify_document(doc_id: str, body: dict | None = None):
+    """Run the classifier on an existing document.
+
+    Args:
+        doc_id: Identifier of the document to classify.
+        body: Optional JSON object. Its ``"topics"`` value, if present, is
+            forwarded to the classifier as the second argument.
+
+    Returns:
+        ``{"topics": ...}`` with whatever the classifier returned.
+
+    Raises:
+        HTTPException: 404 if the document does not exist; 500 if the
+            classifier raises.
+    """
+    meta = storage.get_metadata(doc_id)
+    if meta is None:
+        raise HTTPException(404, "Document not found")
+
+    # ``None`` default instead of a shared mutable ``{}``; a missing or empty
+    # body means no explicit topic list is passed on.
+    topic_names = body.get("topics") if body else None
+    try:
+        topics = await classifier.classify_document(doc_id, topic_names)
+    except Exception as e:
+        # Chain the cause so the original traceback is kept in server logs.
+        raise HTTPException(500, f"Classification failed: {e}") from e
+
+    return {"topics": topics}