chore: initial commit — existing single-user document scanner codebase

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-05-22 08:53:28 +02:00
parent 6fed5ba531
commit 7a34807fa0
71 changed files with 16408 additions and 0 deletions
+71
View File
@@ -0,0 +1,71 @@
"""
Text extraction dispatcher.
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
"""
from pathlib import Path
# Maximum number of characters of extracted text kept by _truncate(); longer
# text is cut at this length and a "[... truncated ...]" marker is appended.
MAX_STORED_CHARS = 50_000
def extract_text(file_path: str, mime_type: str) -> str:
    """Extract plain text from a file, choosing an extractor by file type.

    The type is taken from ``mime_type`` first, with the lower-cased file
    extension as a fallback for a missing or generic MIME type:

    * PDF            -> ``_extract_pdf``
    * Word documents -> ``_extract_docx``
    * images         -> ``_extract_image`` (OCR)
    * anything else  -> ``_extract_text_file``

    Args:
        file_path: Path of the file to read.
        mime_type: MIME type reported for the file. Empty or ``None`` is
            tolerated; the extension is then the only signal.

    Returns:
        The extracted text, or ``"[Extraction error: ...]"`` when the
        selected extractor raises. Extractor exceptions are never propagated.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()
    word_mime_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    # Extension fallback for images, mirroring the PDF and Word branches, so
    # an image uploaded with a generic MIME type is OCR'd instead of being
    # decoded as a text file.
    image_suffixes = (
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tif", ".tiff", ".webp",
    )
    try:
        if mime_type == "application/pdf" or suffix == ".pdf":
            return _extract_pdf(path)
        if mime_type in word_mime_types or suffix in (".docx", ".doc"):
            # NOTE(review): legacy .doc files are routed to the python-docx
            # extractor as well; it presumably only parses .docx, in which
            # case these end up as an "[Extraction error: ...]" -- confirm.
            return _extract_docx(path)
        if (mime_type and mime_type.startswith("image/")) or suffix in image_suffixes:
            return _extract_image(path)
        return _extract_text_file(path)
    except Exception as e:
        # Best effort by design: report the failure as the result text
        # rather than raising to the caller.
        return f"[Extraction error: {e}]"
def _extract_pdf(path: Path) -> str:
    """Return the text of all pages of a PDF, newline-joined and truncated.

    Args:
        path: Location of the PDF file.

    Returns:
        The text of every page joined with ``"\\n"``, passed through
        ``_truncate``.

    Raises:
        Whatever PyMuPDF raises when the file cannot be opened or read.
    """
    # Imported here rather than at module level, so PyMuPDF is only needed
    # when a PDF is actually processed.
    import fitz  # PyMuPDF

    # The context manager closes the document even if a page fails to
    # extract, which the earlier explicit close() did not guarantee.
    with fitz.open(str(path)) as doc:
        pages = [page.get_text() for page in doc]
    return _truncate("\n".join(pages))
def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraphs of a Word document, newline-joined.

    Args:
        path: Location of the document.

    Returns:
        The text of each paragraph that contains non-whitespace characters,
        joined with ``"\\n"`` and passed through ``_truncate``.
    """
    # Imported here rather than at module level, so python-docx is only
    # needed when a Word document is actually processed.
    from docx import Document

    document = Document(str(path))
    # NOTE(review): only ``document.paragraphs`` is read; text held elsewhere
    # (e.g. in tables) is presumably not included -- confirm if that matters.
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return _truncate("\n".join(kept))
def _extract_image(path: Path) -> str:
    """Run OCR on an image file and return the recognised text.

    Args:
        path: Location of the image.

    Returns:
        The OCR output passed through ``_truncate``. If Pillow or
        pytesseract cannot be imported, returns
        ``"[OCR unavailable: ...]"``; any other failure is returned as
        ``"[OCR error: ...]"``. This function does not raise.
    """
    try:
        # Imported here so the OCR stack is optional: without it, image
        # files yield a placeholder message instead of an import failure.
        from PIL import Image
        import pytesseract

        # Close the image file as soon as OCR is done instead of leaving
        # the handle open until garbage collection.
        with Image.open(str(path)) as img:
            text = pytesseract.image_to_string(img)
        return _truncate(text)
    except ImportError:
        return "[OCR unavailable: pytesseract or Pillow not installed]"
    except Exception as e:
        return f"[OCR error: {e}]"
def _extract_text_file(path: Path) -> str:
    """Read a file as text, trying progressively more permissive encodings.

    Encodings are tried from strictest to most permissive:

    1. ``utf-8-sig`` -- UTF-8, dropping a leading byte-order mark if present
       (``str.strip`` would not remove it later).
    2. ``cp1252`` -- must come before ``latin-1``: it rejects the few byte
       values it leaves undefined, whereas ``latin-1`` accepts every byte and
       would otherwise shadow it.
    3. ``latin-1`` -- last resort; never fails to decode.

    Args:
        path: Location of the file.

    Returns:
        The decoded contents passed through ``_truncate``.

    Raises:
        OSError: If the file cannot be read.
    """
    for enc in ("utf-8-sig", "cp1252", "latin-1"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    # Safeguard only: with ``latin-1`` in the list above a decode always
    # succeeds, so this is reached only if that list is changed.
    return "[Could not decode text file]"
def _truncate(text: str, max_chars: "int | None" = None) -> str:
    """Strip surrounding whitespace and cap the text length.

    Args:
        text: Text to normalise.
        max_chars: Maximum number of characters of ``text`` to keep. When
            ``None`` (the default) the module-level ``MAX_STORED_CHARS`` is
            used. Expected to be non-negative.

    Returns:
        The stripped text. If it is longer than the limit, the first
        ``max_chars`` characters followed by ``"\\n[... truncated ...]"``;
        the marker is added on top of the limit, so the result may be longer
        than ``max_chars``.
    """
    limit = MAX_STORED_CHARS if max_chars is None else max_chars
    stripped = text.strip()
    if len(stripped) <= limit:
        return stripped
    return stripped[:limit] + "\n[... truncated ...]"