""" Reads doc_service_config.json from the storage-service config bucket. 30-second TTL cache + env var overrides. Env var overrides (all optional): DOC_MAX_PDF_MB — max upload size in megabytes (e.g. "50") """ import os import time from copy import deepcopy import httpx from app.core.config import settings _CONFIG_KEY = "doc_service_config.json" _DEFAULT_STORAGE_CONFIG: dict = { "watch_enabled": False, "watch_path": "/data/watch", "ai_folder_suggestion": False, "ai_folder_default": "imports", "ai_rename_suggestion": False, } _DEFAULT_SYSTEM_PROMPT = ( "You are a financial document analysis assistant. " "Given the text extracted from a PDF document, return ONLY a JSON object " "with no markdown, no code fences, and no explanation." ) _DEFAULT_USER_TEMPLATE = ( 'Analyze the following document text and return a JSON object with exactly these keys:\n' 'title (a short, descriptive human-readable title for this document, e.g. "ACME Corp Invoice April 2026", "Office Supplies Receipt", "Q1 Flower Delivery Order"),\n' 'document_type (one of: invoice, bill, receipt, order, expense, revenue, unknown),\n' 'total_amount (string or null),\n' 'currency (string or null),\n' 'vendor_name (string or null),\n' 'customer_name (string or null),\n' 'billing_address (string or null),\n' 'customer_address (string or null),\n' 'invoice_number (string or null),\n' 'invoice_date (string or null),\n' 'due_date (string or null),\n' 'tags (array of short keyword strings describing the document),\n' 'line_items (array of objects, each with keys: description, amount),\n' 'suggested_categories (array of 2 to 5 short category name strings a user might want to file this document under, e.g. "Utilities", "Travel", "Software Subscriptions", "Client Invoices").\n' '\n' 'Document text:\n' '{text}' ) _DEFAULT_CONFIG: dict = { "documents": {"max_pdf_bytes": 20 * 1024 * 1024}, "storage": _DEFAULT_STORAGE_CONFIG, "system_prompts": { "system": _DEFAULT_SYSTEM_PROMPT, "user_template": _DEFAULT_USER_TEMPLATE, }, } _cache: dict | None = None _cache_at: float = 0.0 _CACHE_TTL = 30.0 def _storage_url() -> str: return f"{settings.STORAGE_SERVICE_URL}/objects/config/{_CONFIG_KEY}" async def _fetch_config() -> dict: """Fetch config from storage-service. Returns defaults if not found.""" async with httpx.AsyncClient(timeout=10.0) as client: resp = await client.get(_storage_url()) if resp.status_code == 404: return deepcopy(_DEFAULT_CONFIG) resp.raise_for_status() return resp.json() async def _write_config(data: dict) -> None: import json payload = json.dumps(data, indent=2).encode() async with httpx.AsyncClient(timeout=10.0) as client: resp = await client.put( _storage_url(), content=payload, headers={"Content-Type": "application/octet-stream"}, ) resp.raise_for_status() def _apply_env_overrides(config: dict) -> dict: cfg = deepcopy(config) docs = cfg.setdefault("documents", {}) if v := os.environ.get("DOC_MAX_PDF_MB"): try: docs["max_pdf_bytes"] = int(v) * 1024 * 1024 except ValueError: pass return cfg async def load_doc_config() -> dict: global _cache, _cache_at now = time.monotonic() if _cache is not None and (now - _cache_at) < _CACHE_TTL: return _cache raw = await _fetch_config() data = _apply_env_overrides(raw) _cache = data _cache_at = now return data async def get_storage_config() -> dict: """Return storage config block, filling in defaults for any missing keys.""" config = await load_doc_config() result = deepcopy(_DEFAULT_STORAGE_CONFIG) result.update(config.get("storage", {})) return result async def save_storage_config(data: dict) -> None: """Merge data into the storage config block and persist to storage-service.""" global _cache, _cache_at raw = await _fetch_config() raw.setdefault("storage", {}).update(data) await _write_config(raw) _cache = None _cache_at = 0.0