# Business-Management/scripts/security_check.py

#!/usr/bin/env python3
"""
Security pre-commit checker.
Runs inside a Docker container — do not execute directly on the host.

Checks:
  1. Hardcoded secrets / credentials in staged files
  2. Dangerous patterns (eval, exec, shell=True, pickle)
  3. Weak cryptography (MD5, SHA1 for passwords, DES)
  4. SQL injection risk (f-strings / .format() / % in execute/query/text())
  5. Missing input sanitization (raw request attributes passed to DB)
  6. JWT vulnerabilities (algorithm=none, verify_exp=False, long-lived tokens)
  7. Debug/development flags left in code
  8. bandit static analysis on Python files
"""

import os
import re
import subprocess
import sys
from pathlib import Path

# ── Patterns ─────────────────────────────────────────────────────────────────

SECRET_PATTERNS = [
    # Assignment of a quoted literal (4+ chars) to a name that ENDS in a
    # credential keyword: password, db_password, dbPassword, self.api_key, ...
    # Two alternatives are accepted in front of the keyword:
    #   * the keyword stands alone (not preceded by a letter or underscore) —
    #     this is what was already flagged, including a bare PASSWORD = "...";
    #   * the keyword is the tail of a longer identifier, unless that identifier
    #     is entirely upper-case/digits/underscores (TEST_PASSWORD, DB_TOKEN),
    #     which is treated as a test constant and skipped.
    # Case-insensitivity is scoped to the keyword group only; a global (?i)
    # would also make the upper-case character classes match lower-case text
    # and silently disable the ALL_CAPS exclusion.
    (r'(?:(?<![A-Za-z_])|\b(?![A-Z0-9_]+\s*=)\w*)'
     r'(?i:password|passwd|secret|api_key|apikey|token|auth)'
     r'\s*=\s*["\'][^"\']{4,}["\']',
     "possible hardcoded credential"),
    # secret_key literal, unless the rest of the line mentions "change"
    # (e.g. a "change-me" placeholder).
    (r'(?i)secret_key\s*=\s*["\'][^"\']{4,}["\'](?!.*change)',
     "hardcoded secret_key (use env var)"),
    # PEM private-key header of any of the listed key types.
    (r'-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----',
     "private key embedded in source"),
    # AWS key assignment with a 16+ character base64-like literal.
    (r'(?i)(aws_access_key_id|aws_secret_access_key)\s*=\s*["\'][A-Za-z0-9/+]{16,}["\']',
     "possible AWS credential"),
]

DANGEROUS_PATTERNS = [
    (r'\beval\s*\(', "use of eval()"),
    (r'\bexec\s*\(', "use of exec()"),
    # Attribute name allows upper-case so subprocess.Popen(...) is covered too.
    (r'subprocess\.[A-Za-z_]+\(.*shell\s*=\s*True', "subprocess with shell=True"),
    (r'pickle\.loads?\(', "use of pickle (deserialization risk)"),
    # yaml.load(...) is flagged unless the rest of the line names SafeLoader /
    # CSafeLoader, or a trailing comment containing "safe" acknowledges it.
    (r'yaml\.load\s*\((?!.*\bC?SafeLoader\b)[^)]*\)(?!\s*#.*safe)',
     "yaml.load without Loader=yaml.SafeLoader"),
]

WEAK_CRYPTO_PATTERNS = [
    # Upper-case algorithm name as a whole word, or the hashlib constructor.
    (
        r'\bMD5\b|\bhashlib\.md5\b',
        "MD5 is not suitable for password hashing",
    ),
    (
        r'\bSHA1\b|\bhashlib\.sha1\b',
        "SHA1 is not suitable for password hashing",
    ),
    # Whole-word "DES". The trailing (?!C) can never reject anything, because
    # \b already guarantees the next character is not a word character.
    (
        r'\bDES\b(?!C)',
        "DES is a weak cipher",
    ),
    # Any call to random.random() / random.randint( is reported.
    (
        r'random\.random\(\)|random\.randint\(',
        "use secrets module for security-sensitive randomness",
    ),
]

SQL_INJECTION_PATTERNS = [
    # f-string or .format() passed directly to execute/query
    (r'(execute|query)\s*\(\s*f["\']',
     "potential SQL injection: f-string passed to execute/query"),
    (r'(execute|query)\s*\(.*\.format\s*\(',
     "potential SQL injection: .format() passed to execute/query"),
    # %-formatting with a tuple/dict operand inside execute/query.
    # The negative lookahead skips DB-API named placeholders such as %(id)s,
    # which belong to a parameterised (safe) query and are not string
    # formatting performed by the caller.
    (r'(execute|query)\s*\(.*%(?!\(\w+\)[A-Za-z])\s*[({]',
     "potential SQL injection: %-formatting passed to execute/query"),
    # SQLAlchemy text() used without bindparam / colon-style params
    (r'\btext\s*\(\s*f["\']',
     "SQLAlchemy text() with f-string — use bindparam() instead"),
    (r'\btext\s*\(.*\.format\s*\(',
     "SQLAlchemy text() with .format() — use bindparam() instead"),
    # String concatenation into a variable named *query* or *sql*
    (r'(sql|query)\s*[+]=\s*["\']',
     "possible SQL string concatenation — use ORM or bindparam()"),
]

SANITIZATION_PATTERNS = [
    # Single-line heuristic: a db.add(...) or db.execute(...) call whose
    # argument text contains "request." somewhere after the opening paren.
    (
        r'\bdb\.(add|execute)\s*\(.*request\.',
        "raw request attribute passed to DB — route through a Pydantic schema first",
    ),
]

JWT_PATTERNS = [
    # --- 'none' algorithm accepted or selected -------------------------------
    (
        r'algorithms?\s*=\s*\[.*["\']none["\']',
        "JWT algorithm 'none' accepted — algorithm confusion attack (Critical)",
    ),
    (
        r'algorithm\s*=\s*["\']none["\']',
        "JWT algorithm set to 'none' — algorithm confusion attack (Critical)",
    ),
    # --- expiry verification switched off (keyword/attr form, then dict form)
    (
        r'verify_exp\s*[=:]\s*False',
        "JWT expiry verification disabled — tokens never expire (Critical)",
    ),
    (
        r'options\s*=\s*\{[^}]*["\']verify_exp["\'].*False',
        "JWT expiry verification disabled in options dict (Critical)",
    ),
    # --- token lifetime ------------------------------------------------------
    # Any timedelta(days=<non-zero digit>...) on the line is reported.
    (
        r'timedelta\s*\(\s*days\s*=\s*[1-9]',
        "JWT token with multi-day expiry — use hours, not days (High)",
    ),
    # EXPIRE_MINUTES assigned a value of five or more digits.
    (
        r'EXPIRE_MINUTES\s*[=:]\s*[1-9]\d{4,}',
        "JWT EXPIRE_MINUTES value > 9999 (> 7 days) — reduce token lifetime (High)",
    ),
    # --- literal SECRET_KEY, unless the rest of the line mentions env/change -
    (
        r'SECRET_KEY\s*=\s*["\'][a-zA-Z0-9_\-]{4,}["\'](?!.*env|.*change)',
        "possible hardcoded JWT secret — use env var (High)",
    ),
]

DEBUG_PATTERNS = [
    # Lower-case debug=True (keyword argument or assignment).
    (
        r'\bdebug\s*=\s*True\b',
        "debug=True left in code",
    ),
    # A print( call with "password" anywhere later on the same line.
    (
        r'print\s*\(.*password',
        "possible password printed to stdout",
    ),
]

# Flattened (category, regex, message) triples, in the order the pattern
# tables are listed here; scan_file() walks this list for every line.
ALL_PATTERNS = [
    (category, pattern, message)
    for category, table in (
        ("SECRET", SECRET_PATTERNS),
        ("DANGER", DANGEROUS_PATTERNS),
        ("CRYPTO", WEAK_CRYPTO_PATTERNS),
        ("SQLINJ", SQL_INJECTION_PATTERNS),
        ("SANIT", SANITIZATION_PATTERNS),
        ("JWT", JWT_PATTERNS),
        ("DEBUG", DEBUG_PATTERNS),
    )
    for pattern, message in table
]

# Path components that exclude a staged file when present anywhere in its path.
SKIP_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".venv",
    "dist",
    "build",
    "changelog",
}
# File names that are never scanned (lock files, and this checker itself).
SKIP_FILES = {
    "package-lock.json",
    "poetry.lock",
    "security_check.py",
}
# Suffixes that are never scanned: documentation may mention the pattern
# keywords without risk, and binary assets are not meaningful as text.
SKIP_EXTENSIONS = (
    {".md", ".txt", ".rst"}                          # documentation
    | {".png", ".jpg", ".jpeg", ".gif", ".ico"}      # images
    | {".woff", ".woff2", ".ttf", ".eot"}            # fonts
)

# ── Helpers ───────────────────────────────────────────────────────────────────

def get_staged_files() -> list[Path]:
    """Return the staged paths that should be scanned.

    Paths are read from the ``STAGED_FILES`` environment variable (set by the
    pre-commit hook), one per line. Blank lines are ignored. A path is dropped
    when its suffix is in ``SKIP_EXTENSIONS`` (compared case-insensitively, so
    ``LOGO.PNG`` and ``README.MD`` are skipped as well), when any path
    component is in ``SKIP_DIRS``, when its file name is in ``SKIP_FILES``, or
    when it is not an existing regular file (e.g. a deleted file or a
    directory entry).

    Returns:
        The remaining paths, in the order they appear in ``STAGED_FILES``.
    """
    raw = os.environ.get("STAGED_FILES", "")
    files: list[Path] = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        p = Path(line)
        # SKIP_EXTENSIONS holds lower-case suffixes; normalise before lookup.
        if p.suffix.lower() in SKIP_EXTENSIONS:
            continue
        if any(part in SKIP_DIRS for part in p.parts):
            continue
        if p.name in SKIP_FILES:
            continue
        # is_file() rather than exists(): a directory cannot be read as text.
        if p.is_file():
            files.append(p)
    return files


def scan_file(path: Path) -> list[tuple[int, str, str]]:
    """Scan one file line by line against every entry in ``ALL_PATTERNS``.

    The file is decoded as UTF-8 with undecodable bytes dropped, so the result
    does not depend on the container's locale. Blank lines and lines whose
    first non-blank characters are ``#`` or ``//`` (comments that merely
    mention a pattern) are skipped before any regex is run.

    Args:
        path: File to scan.

    Returns:
        One ``(line_number, category, message)`` tuple per matching pattern,
        with 1-based line numbers. A line may produce several findings. An
        unreadable file yields an empty list and a warning on stderr, so a
        skipped file is visible instead of silently passing the check.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        print(f"[security] warning: could not read {path}: {exc}", file=sys.stderr)
        return []

    findings: list[tuple[int, str, str]] = []
    for line_no, line in enumerate(content.splitlines(), start=1):
        stripped = line.strip()
        # Decided once per line rather than once per matching pattern.
        if not stripped or stripped.startswith(("#", "//")):
            continue
        findings.extend(
            (line_no, category, message)
            for category, pattern, message in ALL_PATTERNS
            if re.search(pattern, line)
        )
    return findings


def run_bandit(py_files: list[Path]) -> tuple[bool, str]:
    """Run bandit over the given Python files via the current interpreter.

    Args:
        py_files: Files to analyse; an empty list skips bandit entirely.

    Returns:
        ``(passed, output)``. ``passed`` is True when nothing was run or when
        the bandit process exited with status 0; ``output`` is the process's
        stdout followed by its stderr (empty when nothing was run).
    """
    if py_files:
        # "--" ends option parsing so file names are never read as flags.
        command = [sys.executable, "-m", "bandit", "-q", "-ll", "--"]
        command.extend(str(target) for target in py_files)
        proc = subprocess.run(command, capture_output=True, text=True)
        return proc.returncode == 0, proc.stdout + proc.stderr
    return True, ""

# ── Main ──────────────────────────────────────────────────────────────────────

def main() -> int:
    """Scan the staged files and report findings on stdout.

    Returns:
        0 when there is nothing to scan or no issue was found, 1 otherwise.
        Every regex finding counts as one issue; a failing bandit run counts
        as a single additional issue regardless of how much it reported.
    """
    targets = get_staged_files()
    if not targets:
        print("[security] no staged files to check")
        return 0

    print(f"[security] scanning {len(targets)} staged file(s)...")

    issue_count = 0
    for target in targets:
        for line_no, category, message in scan_file(target):
            print(f"  [{category}] {target}:{line_no} — {message}")
            issue_count += 1

    bandit_ok, bandit_out = run_bandit([t for t in targets if t.suffix == ".py"])
    if not bandit_ok:
        print("\n[security] bandit found issues:")
        print(bandit_out)
        issue_count += 1

    if not issue_count:
        print("[security] all checks passed.")
        return 0

    print(f"\n[security] BLOCKED — {issue_count} issue(s) found. Fix them or use git commit --no-verify to override.")
    return 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())