0d34867a69
- New `features/doc-service` FastAPI microservice: PDF upload, async text extraction (pdfplumber), AI classification via Anthropic/Ollama/LM Studio, per-user categories, file download - Alembic migration isolated with `alembic_version_doc_service` table - Main backend: httpx proxy routers for /api/documents/* and /api/documents/categories/*, admin settings API at /api/settings/* - Runtime config in /config/doc_service_config.json (shared Docker volume); api_key masking on reads; atomic write with os.replace() - Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage launcher hub, simplified Nav (removed Settings link), new routes - docker-compose: doc-service service, doc_data + app_config volumes, removed internal:true from backend-net for outbound AI API calls - Fix pre-commit hook: probe Docker socket path so git subprocess picks up Docker Desktop on macOS - Fix security_check.py: use sys.executable for bandit so venv python is used instead of system python Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
209 lines
8.3 KiB
Python
209 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Security pre-commit checker.
|
|
Runs inside a Docker container — do not execute directly on the host.
|
|
|
|
Checks:
|
|
1. Hardcoded secrets / credentials in staged files
|
|
2. Dangerous patterns (eval, exec, shell=True, pickle)
|
|
3. Weak cryptography (MD5, SHA1 for passwords, DES)
|
|
4. SQL injection risk (f-strings / .format() / % in execute/query/text())
|
|
5. Missing input sanitization (raw request attributes passed to DB)
|
|
6. JWT vulnerabilities (algorithm=none, verify_exp=False, long-lived tokens)
|
|
7. Debug/development flags left in code
|
|
8. bandit static analysis on Python files
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# ── Patterns ─────────────────────────────────────────────────────────────────
|
|
|
|
# Hardcoded-credential rules. Each entry is (regex, message); the regex is
# applied to one source line at a time.
SECRET_PATTERNS = [
    # Generic `<keyword> = "<4+ chars>"` assignment.
    # The leading (?i) applies to the whole pattern, lookarounds included, so
    # the keyword matches in any case but only when it is not directly
    # adjacent to a letter or underscore: `password = "..."` and
    # `PASSWORD = "..."` match, while `db_password = "..."` and
    # `password_hash = "..."` do not.
    # NOTE(review): an earlier comment said ALL_CAPS constants are excluded;
    # the regex does not do that — confirm which behaviour is intended.
    (r'(?i)(?<![A-Z_])(password|passwd|secret|api_key|apikey|token|auth)(?![A-Z_])\s*=\s*["\'][^"\']{4,}["\']',
     "possible hardcoded credential"),
    # secret_key literal; not reported when the text after the closing quote
    # (e.g. a trailing comment) contains "change".
    (r'(?i)secret_key\s*=\s*["\'][^"\']{4,}["\'](?!.*change)',
     "hardcoded secret_key (use env var)"),
    # PEM private-key header, with or without an algorithm prefix.
    (r'-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----',
     "private key embedded in source"),
    # AWS credential variables assigned a quoted value of 16+ characters
    # drawn from letters, digits, "/" and "+".
    (r'(?i)(aws_access_key_id|aws_secret_access_key)\s*=\s*["\'][A-Za-z0-9/+]{16,}["\']',
     "possible AWS credential"),
]
|
|
|
|
# Dangerous-call rules. Each entry is (regex, message); the regex is applied
# to one source line at a time, so calls split over several lines are only
# seen piecewise.
DANGEROUS_PATTERNS = [
    (r'\beval\s*\(', "use of eval()"),
    (r'\bexec\s*\(', "use of exec()"),
    (r'subprocess\.[a-z_]+\(.*shell\s*=\s*True', "subprocess with shell=True"),
    (r'pickle\.loads?\(', "use of pickle (deserialization risk)"),
    # yaml.load(...) is reported unless its argument list names SafeLoader /
    # CSafeLoader, or the call is followed by a comment containing "safe".
    # Without the first lookahead a call that already passes
    # Loader=yaml.SafeLoader would be reported too, contradicting the message.
    (r'yaml\.load\s*\((?![^)]*\bC?SafeLoader\b)[^)]*\)(?!\s*#.*safe)',
     "yaml.load without Loader=yaml.SafeLoader"),
]
|
|
|
|
# Weak-cryptography rules. Each entry is (regex, message); matching is
# case-sensitive and done one source line at a time.
WEAK_CRYPTO_PATTERNS = [
    (r'\bMD5\b|\bhashlib\.md5\b', "MD5 is not suitable for password hashing"),
    (r'\bSHA1\b|\bhashlib\.sha1\b', "SHA1 is not suitable for password hashing"),
    # The word boundary after "DES" already rejects "DESC"; the trailing
    # (?!C) is redundant and kept only to leave the rule unchanged.
    (r'\bDES\b(?!C)', "DES is a weak cipher"),
    (r'random\.random\(\)|random\.randint\(', "use secrets module for security-sensitive randomness"),
]
|
|
|
|
# SQL-injection heuristics. Each entry is (regex, message); the regex is
# applied to one source line at a time.
SQL_INJECTION_PATTERNS = [
    # f-string, .format() or %-formatting passed directly to execute/query.
    (r'(execute|query)\s*\(\s*f["\']',
     "potential SQL injection: f-string passed to execute/query"),
    (r'(execute|query)\s*\(.*\.format\s*\(',
     "potential SQL injection: .format() passed to execute/query"),
    # "%" must be followed by "(" or "{", so a bare "%s" placeholder handed
    # to a parameterised call is not reported.
    (r'(execute|query)\s*\(.*%\s*[({]',
     "potential SQL injection: %-formatting passed to execute/query"),
    # SQLAlchemy text() built from an f-string or .format() instead of
    # bindparam / colon-style parameters.
    (r'\btext\s*\(\s*f["\']',
     "SQLAlchemy text() with f-string — use bindparam() instead"),
    (r'\btext\s*\(.*\.format\s*\(',
     "SQLAlchemy text() with .format() — use bindparam() instead"),
    # `+=` of a string literal onto a name ending in "sql" or "query".
    (r'(sql|query)\s*[+]=\s*["\']',
     "possible SQL string concatenation — use ORM or bindparam()"),
]
|
|
|
|
# Input-sanitization heuristics. Each entry is (regex, message); the regex is
# applied to one source line at a time.
SANITIZATION_PATTERNS = [
    # Flags a db.add(...) or db.execute(...) call that mentions `request.`
    # later on the same line, i.e. request data that appears to reach the DB
    # session without going through a schema first.
    # This rule does not inspect Pydantic models or their validators.
    (r'\bdb\.(add|execute)\s*\(.*request\.',
     "raw request attribute passed to DB — route through a Pydantic schema first"),
]
|
|
|
|
# JWT misuse rules. Each entry is (regex, message); matching is
# case-sensitive and done one source line at a time.
JWT_PATTERNS = [
    # Algorithm confusion attack — accepting the 'none' algorithm.
    (r'algorithms?\s*=\s*\[.*["\']none["\']',
     "JWT algorithm 'none' accepted — algorithm confusion attack (Critical)"),
    (r'algorithm\s*=\s*["\']none["\']',
     "JWT algorithm set to 'none' — algorithm confusion attack (Critical)"),
    # Disabling expiry verification, as a keyword/attribute or inside an
    # options dict literal.
    (r'verify_exp\s*[=:]\s*False',
     "JWT expiry verification disabled — tokens never expire (Critical)"),
    (r'options\s*=\s*\{[^}]*["\']verify_exp["\'].*False',
     "JWT expiry verification disabled in options dict (Critical)"),
    # Long-lived tokens: any timedelta(days=<non-zero>) is treated as
    # suspicious; the rule cannot tell whether the line is JWT-related.
    (r'timedelta\s*\(\s*days\s*=\s*[1-9]',
     "JWT token with multi-day expiry — use hours, not days (High)"),
    # EXPIRE_MINUTES assigned a value of five or more digits (>= 10000
    # minutes, roughly 7 days).
    (r'EXPIRE_MINUTES\s*[=:]\s*[1-9]\d{4,}',
     "JWT EXPIRE_MINUTES value > 9999 (> 7 days) — reduce token lifetime (High)"),
    # Hardcoded JWT secret; not reported when the rest of the line mentions
    # "env" or "change".
    (r'SECRET_KEY\s*=\s*["\'][a-zA-Z0-9_\-]{4,}["\'](?!.*env|.*change)',
     "possible hardcoded JWT secret — use env var (High)"),
]
|
|
|
|
# Debug-leftover rules. Each entry is (regex, message); matching is
# case-sensitive and done one source line at a time.
DEBUG_PATTERNS = [
    (r'\bdebug\s*=\s*True\b', "debug=True left in code"),
    # Any print(...) call whose line mentions "password" after the opening
    # parenthesis.
    (r'print\s*\(.*password', "possible password printed to stdout"),
]
|
|
|
|
# Every rule table flattened into (category, regex, message) triples.
# The category tag is what gets printed in brackets for each finding.
ALL_PATTERNS = (
    [("SECRET", p, m) for p, m in SECRET_PATTERNS]
    + [("DANGER", p, m) for p, m in DANGEROUS_PATTERNS]
    + [("CRYPTO", p, m) for p, m in WEAK_CRYPTO_PATTERNS]
    + [("SQLINJ", p, m) for p, m in SQL_INJECTION_PATTERNS]
    + [("SANIT", p, m) for p, m in SANITIZATION_PATTERNS]
    + [("JWT", p, m) for p, m in JWT_PATTERNS]
    + [("DEBUG", p, m) for p, m in DEBUG_PATTERNS]
)
|
|
|
|
# Files/dirs to skip.
# A staged path is ignored when any of its components is in SKIP_DIRS or its
# file name is in SKIP_FILES.
SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "dist", "build", "changelog"}
SKIP_FILES = {"package-lock.json", "poetry.lock", "security_check.py"}
# Skip documentation and binary files — they may reference pattern keywords
# without risk. Entries are lowercase and include the leading dot.
SKIP_EXTENSIONS = {
    ".md", ".txt", ".rst",
    ".png", ".jpg", ".jpeg", ".gif", ".ico",
    ".woff", ".woff2", ".ttf", ".eot",
}
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
def get_staged_files() -> list[Path]:
    """Return the staged paths that should be scanned.

    Paths are read, one per line, from the ``STAGED_FILES`` environment
    variable (set by the pre-commit hook). Blank lines are ignored. A path is
    dropped when:

    * its extension is in ``SKIP_EXTENSIONS`` (compared case-insensitively,
      so ``logo.PNG`` is skipped like ``logo.png``),
    * any of its components is in ``SKIP_DIRS``,
    * its file name is in ``SKIP_FILES``, or
    * it is not an existing regular file (for example a staged deletion or a
      directory entry).

    Returns:
        The remaining paths, in the order they were listed.
    """
    raw = os.environ.get("STAGED_FILES", "")
    files = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        p = Path(line)
        # SKIP_EXTENSIONS holds lowercase suffixes; normalise before the lookup.
        if p.suffix.lower() in SKIP_EXTENSIONS:
            continue
        if any(part in SKIP_DIRS for part in p.parts):
            continue
        if p.name in SKIP_FILES:
            continue
        # Only regular files can be read and scanned.
        if p.is_file():
            files.append(p)
    return files
|
|
|
|
|
|
def scan_file(path: Path, patterns=None) -> list[tuple[int, str, str]]:
    """Scan one file line by line against the regex rules.

    Args:
        path: File to read. It is decoded as UTF-8 and undecodable bytes are
            dropped, so binary content does not raise.
        patterns: Optional iterable of ``(category, regex, message)`` triples.
            Defaults to the module-level ``ALL_PATTERNS``.

    Returns:
        A list of ``(line_number, category, message)`` tuples, one per rule
        hit, with 1-based line numbers. Lines whose first non-blank
        characters are ``#`` or ``//`` are treated as comments and never
        reported. An unreadable file yields an empty list.
    """
    rules = ALL_PATTERNS if patterns is None else list(patterns)
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
    except (OSError, ValueError):
        # Best effort: a path that cannot be read is skipped, not fatal.
        return []

    findings = []
    for line_no, line in enumerate(content.splitlines(), start=1):
        # Comment lines can never produce a finding, so skip them before
        # running any regex.
        if line.lstrip().startswith(("#", "//")):
            continue
        for category, pattern, message in rules:
            if re.search(pattern, line):
                findings.append((line_no, category, message))
    return findings
|
|
|
|
|
|
def run_bandit(py_files: list[Path]) -> tuple[bool, str]:
    """Run bandit over *py_files* with the interpreter running this script.

    Args:
        py_files: Python files to analyse. When empty, bandit is not started.

    Returns:
        ``(passed, output)``: *passed* is True when bandit exited with status
        0 (or was not run); *output* is its stdout followed by its stderr.
    """
    if not py_files:
        return True, ""
    # sys.executable keeps bandit on the same interpreter (and therefore the
    # same environment) as this script. "--" is the conventional
    # end-of-options marker, so a path starting with "-" is not taken as a
    # flag.
    # NOTE(review): "-q -ll" presumably means quiet output and medium severity
    # or higher — confirm against bandit's CLI documentation.
    command = [sys.executable, "-m", "bandit", "-q", "-ll", "--", *map(str, py_files)]
    result = subprocess.run(command, capture_output=True, text=True, check=False)
    return result.returncode == 0, result.stdout + result.stderr
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
def main() -> int:
    """Scan the staged files and print every finding.

    Runs the regex scan on each staged file, then bandit on the staged
    ``.py`` files.

    Returns:
        0 when there is nothing to scan or no check reported a problem,
        1 otherwise. The caller uses the value as the process exit status.
    """
    staged = get_staged_files()
    if not staged:
        print("[security] no staged files to check")
        return 0

    print(f"[security] scanning {len(staged)} staged file(s)...")

    violations = 0
    for path in staged:
        for line_no, category, message in scan_file(path):
            print(f" [{category}] {path}:{line_no} — {message}")
            violations += 1

    py_files = [f for f in staged if f.suffix == ".py"]
    bandit_ok, bandit_out = run_bandit(py_files)
    if not bandit_ok:
        print("\n[security] bandit found issues:")
        print(bandit_out)
        # A failed bandit run counts as one violation, however many issues
        # its report lists.
        violations += 1

    if violations:
        print(f"\n[security] BLOCKED — {violations} issue(s) found. Fix them or use git commit --no-verify to override.")
        return 1

    print("[security] all checks passed.")
    return 0
|
|
|
|
|
|
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|