Files
kite/backend/services/classifier.py
T
curo1305 6849ebd1e6 feat(03-04): retire flat-file settings; wire per-user AI config via DB lookup
- config.py: Remove SETTINGS_FILE, DEFAULT_SYSTEM_PROMPT, DEFAULT_SETTINGS
  constants; add system_prompt, default_ai_provider, default_ai_model to Settings
- services/classifier.py: Add _DEFAULT_SYSTEM_PROMPT module constant; classify_document
  and suggest_topics_for_document accept ai_provider/ai_model kwargs; no longer calls
  storage.load_settings() — uses app_settings defaults with DB-supplied overrides (D-14, D-15)
- services/storage.py: Delete load_settings, save_settings, mask_api_key, settings_masked;
  remove from __all__; remove import copy, json, DEFAULT_SETTINGS, SETTINGS_FILE (D-12)
- tasks/document_tasks.py: _run resolves user.ai_provider/ai_model via session.get(User,
  doc.user_id) and passes through to classifier; task signature unchanged (T-03-19)
- api/settings.py: Deleted — /api/settings endpoint removed (D-12)
- main.py: Remove settings_router import and include_router call
- tests/test_settings.py: Replace all tests with test_settings_endpoint_removed (404, green)
- tests/test_classifier.py: Implement test_per_user_provider, test_celery_task_uses_user_provider,
  test_default_provider_fallback; remove xfail markers (DOC-03, DOC-05)
2026-05-23 20:32:55 +02:00

128 lines
5.2 KiB
Python

"""
Classification orchestrator.
Resolves the AI provider/model (per-user values with app-level defaults), classifies the document, and auto-creates suggested topics.
Updated in Plan 05: classify_document and suggest_topics_for_document now accept
an AsyncSession as their first argument so they can be called from the Celery task
wrapper and from API route handlers that already hold a session.
Updated in Plan 03-03: classify_document uses load_topics_for_user (D-17) to scope
topic lookup to the document owner's namespace, and creates AI-suggested topics in
the user's namespace via create_topic(user_id=doc.user_id) (D-11).
Updated in Plan 03-04: classify_document and suggest_topics_for_document now accept
ai_provider and ai_model kwargs. No longer calls storage.load_settings(). Provider
resolved via get_provider() using per-user settings from DB (D-14, D-15).
"""
from __future__ import annotations
import uuid as _uuid
from sqlalchemy.ext.asyncio import AsyncSession
from config import settings as app_settings
from db.models import Document
from services import storage
from ai import get_provider
# Upper bound on how many characters of a document's extracted text are
# passed to the AI provider (the text is sliced to this length before the call).
MAX_AI_CHARS = 8000

# Fallback system prompt, used when app_settings.system_prompt is empty/unset.
# Built line by line; the joined value contains exactly one "\n" between
# consecutive lines and no leading or trailing newline.
_DEFAULT_SYSTEM_PROMPT = "\n".join(
    (
        "You are a document classification assistant. When given a document's text content and a list of existing topics, you must:",
        "1. Assign the document to one or more relevant topics from the list.",
        "2. If no existing topics fit well, suggest new topic names.",
        "Return ONLY valid JSON in this exact format, with no additional text or explanation:",
        '{"assigned_topics": ["topic1"], "new_topic_suggestions": ["new topic name"]}',
        'If the document fits no topics and you have no suggestions, return: {"assigned_topics": [], "new_topic_suggestions": []}',
    )
)
async def classify_document(
    session: AsyncSession,
    doc_id: str,
    topic_names: list[str] | None = None,
    ai_provider: str | None = None,
    ai_model: str | None = None,
) -> list[str]:
    """Classify a document by its ID and persist the resulting topics.

    The AI provider is asked to classify the first ``MAX_AI_CHARS`` characters
    of the document's extracted text against ``topic_names``. Every topic the
    provider assigned or suggested is stored on the document, and any of those
    names not already present in ``topic_names`` is created in the document
    owner's namespace (D-11).

    Args:
        session: Async DB session, passed through to the storage helpers and
            used to load the ``Document`` row.
        doc_id: Document identifier. If it does not parse as a UUID the owner
            lookup is skipped and the document is treated as having no owner.
        topic_names: Candidate topics to classify against. When ``None``, the
            owner's topics are loaded (D-17), or all topics when the document
            has no resolvable owner.
        ai_provider: Provider name from the document owner's User record
            (D-14). Falls back to ``app_settings.default_ai_provider`` when
            ``None`` or empty (D-15).
        ai_model: Model name from the document owner's User record (D-14).
            Falls back to ``app_settings.default_ai_model`` when ``None`` or
            empty (D-15).

    Returns:
        The assigned and suggested topic names, whitespace-stripped and
        de-duplicated case-insensitively, assigned topics first, in the order
        the provider returned them.

    Raises:
        ValueError: If no metadata exists for ``doc_id``.
    """
    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    # Per-user values win; app-level defaults fill the gaps (D-14, D-15).
    provider_name = ai_provider or app_settings.default_ai_provider
    model_name = ai_model or app_settings.default_ai_model
    system_prompt = app_settings.system_prompt or _DEFAULT_SYSTEM_PROMPT
    provider = get_provider(
        {
            "active_provider": provider_name,
            "providers": {provider_name: {"model": model_name}},
        }
    )

    # Load the Document ORM object to get the owner's user_id (D-11, D-17).
    # A malformed doc_id is tolerated here: it simply yields "no owner".
    try:
        uid = _uuid.UUID(doc_id)
    except ValueError:
        uid = None
    doc = await session.get(Document, uid) if uid is not None else None
    doc_user_id = doc.user_id if doc is not None else None

    # Use the namespace-scoped topic list if the caller did not restrict it (D-17).
    if topic_names is None:
        if doc_user_id is not None:
            all_topics = await storage.load_topics_for_user(session, user_id=doc_user_id)
        else:
            # Fallback for documents without a user (legacy / test data).
            all_topics = await storage.load_topics(session)
        topic_names = [t["name"] for t in all_topics]

    # `or ""` (rather than a .get default) also covers a key that is present
    # but holds None, which would otherwise make the slice below raise TypeError.
    text = meta.get("extracted_text") or ""
    result = await provider.classify(text[:MAX_AI_CHARS], topic_names, system_prompt)

    # Normalise once so that the names we create and the names we attach to the
    # document are the same strings: strip whitespace, drop blanks, and collapse
    # case-variant duplicates while keeping a deterministic order.
    final_topics: list[str] = []
    seen: set[str] = set()
    for raw_name in [*result.topics, *result.suggested_new_topics]:
        name = raw_name.strip()
        key = name.lower()
        if name and key not in seen:
            seen.add(key)
            final_topics.append(name)

    # Auto-create any topic not already in the candidate list, in the owner's
    # namespace (D-11). Comparison is case-insensitive.
    existing_names = {t.lower() for t in topic_names}
    for name in final_topics:
        if name.lower() not in existing_names:
            await storage.create_topic(session, name, user_id=doc_user_id)

    await storage.update_document_topics(session, doc_id, final_topics)
    return final_topics
async def suggest_topics_for_document(
    session: AsyncSession,
    doc_id: str,
    ai_provider: str | None = None,
    ai_model: str | None = None,
) -> list[str]:
    """Return AI-suggested topic names without modifying the document.

    Args:
        session: Async DB session, passed through to ``storage.get_metadata``.
        doc_id: Identifier of the document whose text is sent to the provider.
        ai_provider: Provider name from the document owner's User record
            (D-14). Falls back to ``app_settings.default_ai_provider`` when
            ``None`` or empty (D-15).
        ai_model: Model name from the document owner's User record (D-14).
            Falls back to ``app_settings.default_ai_model`` when ``None`` or
            empty (D-15).

    Returns:
        Whatever ``provider.suggest_topics`` returns for the first
        ``MAX_AI_CHARS`` characters of the document's extracted text.

    Raises:
        ValueError: If no metadata exists for ``doc_id``.
    """
    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    # Per-user values win; app-level defaults fill the gaps (D-14, D-15).
    provider_name = ai_provider or app_settings.default_ai_provider
    model_name = ai_model or app_settings.default_ai_model
    system_prompt = app_settings.system_prompt or _DEFAULT_SYSTEM_PROMPT
    provider = get_provider(
        {
            "active_provider": provider_name,
            "providers": {provider_name: {"model": model_name}},
        }
    )

    # `or ""` (rather than a .get default) also covers a key that is present
    # but holds None, which would otherwise make the slice below raise TypeError.
    text = meta.get("extracted_text") or ""
    return await provider.suggest_topics(text[:MAX_AI_CHARS], system_prompt)