# Business-Management/backend/app/core/sanitize.py

"""
Input sanitization utilities.

Every string that originates from user input and is destined for the database
MUST pass through these helpers before reaching a SQLAlchemy model or query.
SQLAlchemy's ORM already uses bound parameters (no raw SQL), so these helpers
address the layer above: ensuring data is well-formed, length-capped, and free
of null bytes or control characters before it is stored.
"""

import re
import unicodedata
from datetime import date, datetime

# ── Constants ─────────────────────────────────────────────────────────────────

# Accepted phone shape: an optional leading "+", followed by 7–20 characters
# drawn from digits, whitespace, "-", "(", ")", "[" and "]".
# NOTE(review): in a str pattern, \d and \s are Unicode-aware, so non-ASCII
# digits and whitespace such as tabs or newlines also match — confirm that
# this is intended.
_PHONE_RE = re.compile(r"^\+?[\d\s\-()\[\]]{7,20}$")

# ── Core helper ───────────────────────────────────────────────────────────────


def sanitize_str(value: str | None, max_len: int = 255) -> str | None:
    """Strip whitespace, reject null bytes and non-printable control characters,
    enforce a maximum length.  Returns None unchanged so optional fields work
    naturally with ``Optional[str]`` annotations."""
    if value is None:
        return None

    # Strip leading/trailing whitespace
    value = value.strip()

    # Reject null bytes (common injection vector)
    if "\x00" in value:
        raise ValueError("Input must not contain null bytes")

    # Reject ASCII control characters (0x01–0x1F, 0x7F) except tab/newline/CR
    # which may appear in multi-line fields.  Use Unicode category 'Cc'.
    for ch in value:
        if unicodedata.category(ch) == "Cc" and ch not in ("\t", "\n", "\r"):
            raise ValueError("Input contains invalid control characters")

    if len(value) > max_len:
        raise ValueError(f"Input must not exceed {max_len} characters")

    return value if value != "" else None


def normalize_email(value: str) -> str:
    """Return the email address with surrounding whitespace removed and all
    letters lowercased."""
    trimmed = value.strip()
    return trimmed.lower()


def validate_phone(value: str | None) -> str | None:
    """Sanitize then validate phone number format.

    The value is first passed through ``sanitize_str`` with a 20-character
    cap; ``None`` (including input that is blank after stripping) is returned
    as ``None``.  Anything else must match ``_PHONE_RE``, use the plain space
    as its only whitespace, and contain at least one digit; otherwise
    ``ValueError`` is raised.  On success the sanitized string is returned.
    """
    value = sanitize_str(value, max_len=20)
    if value is None:
        return None

    # The pattern's \s class also matches tabs and line breaks, which
    # sanitize_str deliberately lets through for multi-line fields.  A phone
    # number is a single-line value and the error message promises "spaces",
    # so any other whitespace is treated as a format violation.
    has_other_whitespace = any(ch.isspace() and ch != " " for ch in value)
    if has_other_whitespace or not _PHONE_RE.match(value):
        raise ValueError(
            "Phone number may only contain digits, spaces, +, -, (, ) and [ ] "
            "and must be 7–20 characters"
        )

    # Punctuation alone (e.g. "-------") satisfies the character-class
    # pattern, so require at least one digit explicitly.
    if not any(ch.isdigit() for ch in value):
        raise ValueError("Phone number must contain at least one digit")

    return value


def validate_date_of_birth(value: date | None) -> date | None:
    """Reject obviously invalid birth dates (before 1900 or in the future)."""
    if value is None:
        return None
    if value.year < 1900:
        raise ValueError("Date of birth must be 1900 or later")
    if value > date.today():
        raise ValueError("Date of birth must not be in the future")
    return value