feat: migrate doc-service to use storage-service for file I/O (Phase 2)
- storage.py: replace aiofiles filesystem ops with httpx calls to
storage-service PUT/GET/DELETE /objects/documents/{key}
- Document model: rename file_path → storage_key (plain object key, no path prefix)
- Migration 0008: ALTER COLUMN plus a data migration that strips the /data/documents/ prefix
- documents.py: update upload, delete, download endpoints; _extract_pdf_text
now takes bytes (pdfplumber.open(BytesIO)) instead of a filesystem path
- file_watcher.py: store storage_key instead of file_path on ingestion
- doc-service config: add STORAGE_SERVICE_URL env var
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@ File-system watcher for the watch directory.
|
||||
|
||||
Uses the watchdog library to monitor a configured directory for new PDF files.
|
||||
When a PDF is detected, it is automatically ingested into the document service
|
||||
(copied to /data/documents, a DB record is created, and the AI pipeline runs).
|
||||
(uploaded to storage-service, a DB record is created, and the AI pipeline runs).
|
||||
|
||||
Key design decisions:
|
||||
- No-remove policy: on_deleted and on_moved events are intentionally ignored.
|
||||
@@ -82,13 +82,13 @@ async def ingest_file(path_str: str, watch_root: Path, config: dict) -> None:
|
||||
logger.warning("[watcher] Cannot read %s: %s", path_str, exc)
|
||||
return
|
||||
|
||||
# Save a copy to /data/documents/watch/{doc_id}.pdf
|
||||
# Upload to storage-service under documents/watch/{doc_id}.pdf
|
||||
doc_id = existing.id if existing is not None else str(uuid.uuid4())
|
||||
dest = await save_upload(file_data, WATCH_USER_ID, doc_id)
|
||||
storage_key = await save_upload(file_data, WATCH_USER_ID, doc_id)
|
||||
|
||||
if existing is not None:
|
||||
# Re-ingest a previously failed document
|
||||
existing.file_path = str(dest)
|
||||
existing.storage_key = storage_key
|
||||
existing.file_size = len(file_data)
|
||||
existing.status = "pending"
|
||||
existing.error_message = None
|
||||
@@ -100,7 +100,7 @@ async def ingest_file(path_str: str, watch_root: Path, config: dict) -> None:
|
||||
source="watch",
|
||||
watch_path=path_str,
|
||||
filename=path.name,
|
||||
file_path=str(dest),
|
||||
storage_key=storage_key,
|
||||
file_size=len(file_data),
|
||||
status="pending",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user