6849ebd1e6
- config.py: Remove SETTINGS_FILE, DEFAULT_SYSTEM_PROMPT, DEFAULT_SETTINGS constants; add system_prompt, default_ai_provider, default_ai_model to Settings - services/classifier.py: Add _DEFAULT_SYSTEM_PROMPT module constant; classify_document and suggest_topics_for_document accept ai_provider/ai_model kwargs; no longer calls storage.load_settings() — uses app_settings defaults with DB-supplied overrides (D-14, D-15) - services/storage.py: Delete load_settings, save_settings, mask_api_key, settings_masked; remove from __all__; remove import copy, json, DEFAULT_SETTINGS, SETTINGS_FILE (D-12) - tasks/document_tasks.py: _run resolves user.ai_provider/ai_model via session.get(User, doc.user_id) and passes through to classifier; task signature unchanged (T-03-19) - api/settings.py: Deleted — /api/settings endpoint removed (D-12) - main.py: Remove settings_router import and include_router call - tests/test_settings.py: Replace all tests with test_settings_endpoint_removed (404, green) - tests/test_classifier.py: Implement test_per_user_provider, test_celery_task_uses_user_provider, test_default_provider_fallback; remove xfail markers (DOC-03, DOC-05)
128 lines
5.2 KiB
Python
128 lines
5.2 KiB
Python
"""
|
|
Classification orchestrator.
|
|
Resolves the AI provider and model, classifies the document, and auto-creates suggested topics.
|
|
|
|
Updated in Plan 05: classify_document and suggest_topics_for_document now accept
|
|
an AsyncSession as their first argument so they can be called from the Celery task
|
|
wrapper and from API route handlers that already hold a session.
|
|
|
|
Updated in Plan 03-03: classify_document uses load_topics_for_user (D-17) to scope
|
|
topic lookup to the document owner's namespace, and creates AI-suggested topics in
|
|
the user's namespace via create_topic(user_id=doc.user_id) (D-11).
|
|
|
|
Updated in Plan 03-04: classify_document and suggest_topics_for_document now accept
|
|
ai_provider and ai_model kwargs. No longer calls storage.load_settings(). Provider
|
|
resolved via get_provider() using per-user settings from DB (D-14, D-15).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import uuid as _uuid
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from config import settings as app_settings
|
|
from db.models import Document
|
|
from services import storage
|
|
from ai import get_provider
|
|
|
|
# Upper bound on how many characters of a document's extracted text are sent to
# the AI provider in a single classify / suggest_topics call.
MAX_AI_CHARS = 8_000
|
|
|
|
# Fallback system prompt: used whenever app_settings.system_prompt is empty or
# unset. It asks the model to answer with a JSON object holding the
# "assigned_topics" and "new_topic_suggestions" lists.
_DEFAULT_SYSTEM_PROMPT = """You are a document classification assistant. When given a document's text content and a list of existing topics, you must:
|
|
1. Assign the document to one or more relevant topics from the list.
|
|
2. If no existing topics fit well, suggest new topic names.
|
|
Return ONLY valid JSON in this exact format, with no additional text or explanation:
|
|
{"assigned_topics": ["topic1"], "new_topic_suggestions": ["new topic name"]}
|
|
If the document fits no topics and you have no suggestions, return: {"assigned_topics": [], "new_topic_suggestions": []}"""
|
|
|
|
|
|
def _dedupe_topic_names(names: list[str]) -> list[str]:
    """Strip each name, drop blank ones, and de-duplicate case-insensitively.

    The first spelling seen for a given case-folded name wins, and input order
    is preserved so the result is deterministic across runs.
    """
    unique: dict[str, str] = {}
    for raw in names:
        cleaned = raw.strip()
        if cleaned:
            unique.setdefault(cleaned.casefold(), cleaned)
    return list(unique.values())


async def classify_document(
    session: AsyncSession,
    doc_id: str,
    topic_names: list[str] | None = None,
    ai_provider: str | None = None,
    ai_model: str | None = None,
) -> list[str]:
    """
    Classify a document by its ID and persist the resulting topic list.

    Args:
        session: Async DB session handed through to every storage call.
        doc_id: Document identifier. When it parses as a UUID, the Document
            row is loaded to find the owner's user_id (D-11, D-17).
        topic_names: Candidate topics offered to the provider. When None, the
            owner's topics are loaded via storage.load_topics_for_user, or
            storage.load_topics when no owner could be determined.
        ai_provider: Provider name; falls back to
            app_settings.default_ai_provider when falsy (D-14, D-15).
        ai_model: Model name; falls back to app_settings.default_ai_model
            when falsy (D-14, D-15).

    Returns:
        The topic names written to the document: assigned topics first, then
        new suggestions, whitespace-stripped, blanks removed, and
        de-duplicated case-insensitively.

    Raises:
        ValueError: If storage.get_metadata returns None for doc_id.
    """
    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    _ai_provider = ai_provider or app_settings.default_ai_provider
    _ai_model = ai_model or app_settings.default_ai_model
    system_prompt = app_settings.system_prompt or _DEFAULT_SYSTEM_PROMPT
    _settings = {
        "active_provider": _ai_provider,
        "providers": {_ai_provider: {"model": _ai_model}},
    }
    provider = get_provider(_settings)

    # Load the Document ORM object to get the owner's user_id (D-11, D-17).
    # A doc_id that is not a valid UUID simply means "owner unknown".
    try:
        uid = _uuid.UUID(doc_id)
    except ValueError:
        uid = None

    doc = await session.get(Document, uid) if uid is not None else None
    doc_user_id = doc.user_id if doc is not None else None

    # Use namespace-scoped topic list if not specified (D-17)
    if topic_names is None:
        if doc_user_id is not None:
            all_topics = await storage.load_topics_for_user(session, user_id=doc_user_id)
        else:
            # Fallback for documents without a user (legacy / test data)
            all_topics = await storage.load_topics(session)
        topic_names = [t["name"] for t in all_topics]

    # The key may be present with a None value; treat that like "no text"
    # instead of failing on the slice below.
    text = meta.get("extracted_text") or ""
    result = await provider.classify(text[:MAX_AI_CHARS], topic_names, system_prompt)

    # Final list: everything the AI assigned or suggested, normalised once so
    # the names we create and the names we store on the document are identical.
    final_topics = _dedupe_topic_names([*result.topics, *result.suggested_new_topics])

    # Auto-create any topic not already in the candidate list — in the user's
    # namespace (D-11).
    # NOTE(review): when the caller passes a restricted topic_names, "existing"
    # only covers that subset; any other name returned by the provider is
    # passed to storage.create_topic — confirm create_topic tolerates names
    # that already exist in the registry.
    existing_names = {t.strip().casefold() for t in topic_names}
    for name in final_topics:
        if name.casefold() not in existing_names:
            await storage.create_topic(session, name, user_id=doc_user_id)

    await storage.update_document_topics(session, doc_id, final_topics)
    return final_topics
|
|
|
|
|
|
async def suggest_topics_for_document(
    session: AsyncSession,
    doc_id: str,
    ai_provider: str | None = None,
    ai_model: str | None = None,
) -> list[str]:
    """Return AI-suggested topic names for a document.

    This function only reads the document's metadata and queries the provider;
    it makes no storage writes itself.

    Args:
        session: Async DB session handed to storage.get_metadata.
        doc_id: Identifier of the document whose text is analysed.
        ai_provider: Provider name; falls back to
            app_settings.default_ai_provider when falsy (D-14, D-15).
        ai_model: Model name; falls back to app_settings.default_ai_model
            when falsy (D-14, D-15).

    Returns:
        Whatever provider.suggest_topics returns for the (truncated) text.

    Raises:
        ValueError: If storage.get_metadata returns None for doc_id.
    """
    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    provider_name = ai_provider or app_settings.default_ai_provider
    model_name = ai_model or app_settings.default_ai_model
    system_prompt = app_settings.system_prompt or _DEFAULT_SYSTEM_PROMPT
    provider = get_provider(
        {
            "active_provider": provider_name,
            "providers": {provider_name: {"model": model_name}},
        }
    )

    # The key may be present with a None value; treat that like "no text"
    # instead of failing on the slice below.
    text = meta.get("extracted_text") or ""
    return await provider.suggest_topics(text[:MAX_AI_CHARS], system_prompt)
|