chore: initial commit — existing single-user document scanner codebase
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
Text extraction dispatcher.
|
||||
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
# Maximum number of characters of extracted text kept per document.
# _truncate() cuts anything longer and appends a truncation marker.
MAX_STORED_CHARS = 50_000
|
||||
|
||||
|
||||
def extract_text(file_path: str, mime_type: str) -> str:
    """Extract text from a file, choosing the extractor by type.

    The MIME type is checked first and the file extension is used as a
    fallback, so a missing or generic MIME type (``None``, ``""``,
    ``application/octet-stream``) still reaches the matching extractor.
    Anything that is not recognised as PDF, Word, or image is read as a
    plain text file.

    Args:
        file_path: Path of the file to read.
        mime_type: MIME type reported for the file; may be empty or ``None``.

    Returns:
        Whatever the selected extractor returns, or the string
        ``"[Extraction error: ...]"`` if that extractor raised. Extraction
        is best-effort: exceptions from the extractors are not propagated.
    """
    docx_mime_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    # Extension fallback for images, mirroring the PDF/DOCX fallbacks, so an
    # image without an image/* MIME type is not decoded as a text file.
    image_suffixes = (
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tif", ".tiff", ".webp",
    )

    path = Path(file_path)
    suffix = path.suffix.lower()
    try:
        if mime_type == "application/pdf" or suffix == ".pdf":
            return _extract_pdf(path)
        if mime_type in docx_mime_types or suffix in (".docx", ".doc"):
            return _extract_docx(path)
        if (mime_type and mime_type.startswith("image/")) or suffix in image_suffixes:
            return _extract_image(path)
        return _extract_text_file(path)
    except Exception as e:
        # Deliberate top-level boundary: callers get a readable marker
        # instead of an exception.
        return f"[Extraction error: {e}]"
|
||||
|
||||
|
||||
def _extract_pdf(path: Path) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    PyMuPDF is imported inside the function, so the module can be imported
    without it installed; the import error only surfaces when a PDF is
    actually processed.

    Args:
        path: Location of the PDF file.

    Returns:
        The concatenated page text after passing through ``_truncate``.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(str(path))
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Close the document even if a page fails to yield text, so the
        # underlying file handle is not leaked.
        doc.close()
    return _truncate("\n".join(pages))
|
||||
|
||||
|
||||
def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraphs of a Word document, one per line.

    Only ``Document.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are dropped. The joined text is passed through
    ``_truncate`` before being returned.
    """
    from docx import Document

    document = Document(str(path))
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return _truncate("\n".join(kept))
|
||||
|
||||
|
||||
def _extract_image(path: Path) -> str:
    """Run OCR on an image file and return the recognised text.

    Pillow and pytesseract are imported lazily so OCR stays optional.

    Args:
        path: Location of the image file.

    Returns:
        The OCR text after passing through ``_truncate``, or one of two
        bracketed markers: ``"[OCR unavailable: ...]"`` when either library
        cannot be imported, ``"[OCR error: ...]"`` when opening the image
        or recognising it fails. This function does not raise.
    """
    # Only the imports are guarded by ImportError, so an ImportError raised
    # later (during the OCR work itself) is reported as an OCR error rather
    # than as a missing installation.
    try:
        from PIL import Image
        import pytesseract
    except ImportError:
        return "[OCR unavailable: pytesseract or Pillow not installed]"

    try:
        # The context manager closes the image's file handle even when
        # recognition fails.
        with Image.open(str(path)) as img:
            text = pytesseract.image_to_string(img)
        return _truncate(text)
    except Exception as e:
        return f"[OCR error: {e}]"
|
||||
|
||||
|
||||
def _extract_text_file(path: Path) -> str:
    """Read a text file, trying several encodings in turn.

    Encodings are tried from strictest to most permissive:

    * ``utf-8-sig`` decodes UTF-8 and drops a leading byte-order mark if
      one is present.
    * ``cp1252`` rejects the few byte values it leaves undefined, so it must
      come before ``latin-1``.
    * ``latin-1`` maps every byte to a character and therefore never fails;
      placed any earlier it would make the later encodings unreachable.

    Args:
        path: Location of the text file.

    Returns:
        The decoded text after passing through ``_truncate``. The
        ``"[Could not decode text file]"`` marker is kept as a safeguard in
        case the encoding list is changed to one without a catch-all.
    """
    for enc in ("utf-8-sig", "cp1252", "latin-1"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    return "[Could not decode text file]"
|
||||
|
||||
|
||||
def _truncate(text: str) -> str:
    """Strip surrounding whitespace and cap the text at MAX_STORED_CHARS.

    Text longer than the cap is cut to exactly MAX_STORED_CHARS characters
    and a ``"[... truncated ...]"`` marker is appended on a new line, so the
    returned string can be slightly longer than the cap itself.
    """
    stripped = text.strip()
    if len(stripped) <= MAX_STORED_CHARS:
        return stripped
    return stripped[:MAX_STORED_CHARS] + "\n[... truncated ...]"
|
||||
Reference in New Issue
Block a user