chore: initial commit — existing single-user document scanner codebase

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-22 08:53:28 +02:00
parent 6fed5ba531
commit 7a34807fa0
71 changed files with 16408 additions and 0 deletions
@@ -0,0 +1,71 @@
+"""
+Text extraction dispatcher.
+Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
+"""
+from pathlib import Path
+
+MAX_STORED_CHARS = 50_000  # _truncate keeps at most this many characters of stripped text before appending its marker
+
+
+def extract_text(file_path: str, mime_type: str) -> str:
+    path = Path(file_path)
+    try:
+        if mime_type == "application/pdf" or path.suffix.lower() == ".pdf":
+            return _extract_pdf(path)
+        elif mime_type in (
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/msword",
+        ) or path.suffix.lower() in (".docx", ".doc"):
+            return _extract_docx(path)
+        elif mime_type and mime_type.startswith("image/"):
+            return _extract_image(path)
+        else:
+            return _extract_text_file(path)
+    except Exception as e:
+        return f"[Extraction error: {e}]"
+
+
+def _extract_pdf(path: Path) -> str:
+    import fitz  # PyMuPDF
+    doc = fitz.open(str(path))
+    pages = []
+    for page in doc:
+        pages.append(page.get_text())
+    doc.close()
+    return _truncate("\n".join(pages))
+
+
+def _extract_docx(path: Path) -> str:
+    from docx import Document
+    doc = Document(str(path))
+    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+    return _truncate("\n".join(paragraphs))
+
+
+def _extract_image(path: Path) -> str:
+    try:
+        from PIL import Image
+        import pytesseract
+        img = Image.open(str(path))
+        text = pytesseract.image_to_string(img)
+        return _truncate(text)
+    except ImportError:
+        return "[OCR unavailable: pytesseract or Pillow not installed]"
+    except Exception as e:
+        return f"[OCR error: {e}]"
+
+
+def _extract_text_file(path: Path) -> str:
+    for enc in ("utf-8", "latin-1", "cp1252"):
+        try:
+            return _truncate(path.read_text(encoding=enc))
+        except UnicodeDecodeError:
+            continue
+    return "[Could not decode text file]"
+
+
+def _truncate(text: str) -> str:
+    text = text.strip()
+    if len(text) > MAX_STORED_CHARS:
+        text = text[:MAX_STORED_CHARS] + "\n[... truncated ...]"
+    return text