a548266461
- Add backend/ai/utils.py — parse_classification, parse_suggestions, strip_code_fences shared by all AI providers; removes duplicated private functions from anthropic_provider.py and openai_provider.py - Add backend/deps/utils.py — get_client_ip, parse_uuid request-parsing helpers; removes local _ip() variants from admin.py, auth.py, shares.py, folders.py - Add backend/storage/exceptions.py — canonical CloudConnectionError definition; all routers and backends import from here instead of redefining - Move validate_password_strength to backend/services/auth.py; removes duplicated _validate_password_strength from admin.py and auth.py Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
243 lines
9.5 KiB
Python
243 lines
9.5 KiB
Python
"""
|
|
Google Drive v3 StorageBackend implementation for DocuVault.
|
|
|
|
Design notes:
|
|
- All sync googleapiclient calls are wrapped in asyncio.to_thread() to
|
|
avoid blocking the FastAPI event loop (Pitfall 7 — google-api-python-client
|
|
is synchronous; every .execute() call makes an HTTP request).
|
|
- cache_discovery=False on build() prevents the client library from writing
|
|
a JSON discovery document to /tmp, which would be a directory traversal
|
|
risk (T-05-03-05).
|
|
- D-14: presigned_get_url and generate_presigned_put_url raise
|
|
NotImplementedError. The API upload endpoint detects cloud backends and
|
|
uses the direct put_object() path instead.
|
|
- B2 design: This backend is STATELESS. It raises CloudConnectionError but
|
|
does NOT update the DB or CloudConnection objects. DB state transitions
|
|
(e.g., REQUIRES_REAUTH) are handled by the _call_cloud_op() helper in
|
|
cloud.py (Plan 05-05), which has the DB session.
|
|
- Token key format stored in credentials dict:
|
|
access_token — current OAuth bearer token
|
|
refresh_token — long-lived refresh token
|
|
token_uri — OAuth token endpoint
|
|
client_id — OAuth client ID
|
|
client_secret — OAuth client secret
|
|
expiry — ISO 8601 string (optional)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import datetime
|
|
import io
|
|
import uuid
|
|
|
|
from google.auth.transport.requests import Request
|
|
from google.oauth2.credentials import Credentials
|
|
from googleapiclient.discovery import build
|
|
from googleapiclient.errors import HttpError
|
|
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
|
|
|
|
from storage.base import StorageBackend
|
|
from storage.exceptions import CloudConnectionError # noqa: F401 re-exported for import compatibility
|
|
|
|
|
|
class GoogleDriveBackend(StorageBackend):
|
|
"""Google Drive v3 implementation of StorageBackend.
|
|
|
|
Every sync googleapiclient call is wrapped in asyncio.to_thread() (Pitfall 7).
|
|
The backend is stateless — it raises CloudConnectionError on token issues
|
|
but never writes to the DB.
|
|
"""
|
|
|
|
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
|
|
|
|
def __init__(self, credentials: dict) -> None:
|
|
"""Initialise with a decrypted credentials dict.
|
|
|
|
Expected keys:
|
|
access_token str — current OAuth bearer token
|
|
refresh_token str — long-lived refresh token (may be None)
|
|
token_uri str — defaults to https://oauth2.googleapis.com/token
|
|
client_id str
|
|
client_secret str
|
|
expiry str — ISO 8601 datetime string (optional)
|
|
"""
|
|
self._creds_dict = credentials
|
|
self._creds = self._dict_to_google_creds(credentials)
|
|
|
|
# ── Internal helpers ──────────────────────────────────────────────────────
|
|
|
|
def _dict_to_google_creds(self, d: dict) -> Credentials:
|
|
"""Build a google.oauth2.credentials.Credentials object from a stored dict."""
|
|
creds = Credentials(
|
|
token=d["access_token"],
|
|
refresh_token=d.get("refresh_token"),
|
|
token_uri=d.get("token_uri", "https://oauth2.googleapis.com/token"),
|
|
client_id=d.get("client_id"),
|
|
client_secret=d.get("client_secret"),
|
|
)
|
|
if d.get("expiry"):
|
|
creds.expiry = datetime.datetime.fromisoformat(d["expiry"])
|
|
return creds
|
|
|
|
def _get_service(self):
|
|
"""Build the Drive v3 service object.
|
|
|
|
cache_discovery=False prevents the client library from caching the
|
|
discovery document in /tmp — important for security (T-05-03-05).
|
|
"""
|
|
return build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
|
|
def _handle_http_error(self, exc: HttpError) -> None:
|
|
"""Classify an HttpError and raise CloudConnectionError if appropriate."""
|
|
status = exc.resp.status
|
|
if status == 401:
|
|
# Token expired — API layer should refresh and retry
|
|
raise CloudConnectionError(
|
|
"Google Drive access token expired", reason="token_expired"
|
|
) from exc
|
|
# Check for invalid_grant in the error body
|
|
if status == 400:
|
|
body = exc.content.decode("utf-8", errors="replace").lower()
|
|
if "invalid_grant" in body:
|
|
raise CloudConnectionError(
|
|
"Google Drive refresh token revoked (invalid_grant)",
|
|
reason="invalid_grant",
|
|
) from exc
|
|
raise exc
|
|
|
|
# ── StorageBackend interface ──────────────────────────────────────────────
|
|
|
|
async def put_object(
|
|
self,
|
|
user_id: str,
|
|
document_id: str,
|
|
file_bytes: bytes,
|
|
extension: str,
|
|
content_type: str,
|
|
) -> str:
|
|
"""Upload bytes to Google Drive and return the Drive file_id as object_key.
|
|
|
|
The file is named "{document_id}{extension}" in Drive — no human filename
|
|
is used in provider storage (D-04 spirit / T-05-03-04).
|
|
|
|
Returns the Google Drive file_id (object_key for this document).
|
|
"""
|
|
file_name = f"{document_id}{extension}"
|
|
file_metadata = {"name": file_name}
|
|
buf = io.BytesIO(file_bytes)
|
|
|
|
def _upload() -> str:
|
|
service = self._get_service()
|
|
media = MediaIoBaseUpload(buf, mimetype=content_type, resumable=False)
|
|
try:
|
|
result = (
|
|
service.files()
|
|
.create(body=file_metadata, media_body=media, fields="id")
|
|
.execute()
|
|
)
|
|
return result["id"]
|
|
except HttpError as exc:
|
|
self._handle_http_error(exc)
|
|
|
|
return await asyncio.to_thread(_upload)
|
|
|
|
async def get_object(self, object_key: str) -> bytes:
|
|
"""Download file bytes from Google Drive by Drive file_id.
|
|
|
|
Uses MediaIoBaseDownload to stream the file into a BytesIO buffer.
|
|
"""
|
|
|
|
def _download() -> bytes:
|
|
service = self._get_service()
|
|
try:
|
|
request = service.files().get_media(fileId=object_key)
|
|
buf = io.BytesIO()
|
|
downloader = MediaIoBaseDownload(buf, request)
|
|
done = False
|
|
while not done:
|
|
_, done = downloader.next_chunk()
|
|
return buf.getvalue()
|
|
except HttpError as exc:
|
|
self._handle_http_error(exc)
|
|
|
|
return await asyncio.to_thread(_download)
|
|
|
|
async def delete_object(self, object_key: str) -> None:
|
|
"""Delete a Drive file by file_id. Silent no-op if the file is not found (404)."""
|
|
|
|
def _delete() -> None:
|
|
service = self._get_service()
|
|
try:
|
|
service.files().delete(fileId=object_key).execute()
|
|
except HttpError as exc:
|
|
if exc.resp.status == 404:
|
|
return # Already deleted — no-op per StorageBackend contract
|
|
self._handle_http_error(exc)
|
|
|
|
await asyncio.to_thread(_delete)
|
|
|
|
async def presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str:
|
|
"""Not supported by Google Drive — raises NotImplementedError (D-14).
|
|
|
|
Use get_object() for direct streaming instead. The API layer proxies
|
|
content through the FastAPI endpoint.
|
|
"""
|
|
raise NotImplementedError(
|
|
"Google Drive backend does not support presigned URLs — "
|
|
"use get_object() for streaming"
|
|
)
|
|
|
|
async def generate_presigned_put_url(
|
|
self, object_key: str, expires_minutes: int = 15
|
|
) -> str:
|
|
"""Not supported by Google Drive — raises NotImplementedError (D-14).
|
|
|
|
Use put_object() for direct upload via FastAPI intermediary.
|
|
"""
|
|
raise NotImplementedError(
|
|
"Google Drive backend does not support presigned put URLs — "
|
|
"use put_object() for direct upload"
|
|
)
|
|
|
|
async def stat_object(self, object_key: str) -> int:
|
|
"""Return the file size in bytes from Google Drive metadata.
|
|
|
|
Calls files().get(fileId=object_key, fields='size') and returns
|
|
int(metadata.get('size', 0)).
|
|
|
|
Note: Google Workspace files (Docs, Sheets, Slides) have no binary size
|
|
and return an empty 'size' field. DocuVault only uploads binary files,
|
|
so the 0 fallback handles this edge case.
|
|
"""
|
|
|
|
def _stat() -> int:
|
|
service = self._get_service()
|
|
try:
|
|
metadata = (
|
|
service.files()
|
|
.get(fileId=object_key, fields="size")
|
|
.execute()
|
|
)
|
|
return int(metadata.get("size", 0))
|
|
except HttpError as exc:
|
|
self._handle_http_error(exc)
|
|
|
|
return await asyncio.to_thread(_stat)
|
|
|
|
async def health_check(self) -> bool:
|
|
"""Return True if the Drive service is reachable and the token is valid.
|
|
|
|
Makes a minimal files().list(pageSize=1) call. Returns False on any error
|
|
to avoid propagating transient failures up the call stack.
|
|
"""
|
|
|
|
def _check() -> bool:
|
|
service = self._get_service()
|
|
service.files().list(pageSize=1, fields="files(id)").execute()
|
|
return True
|
|
|
|
try:
|
|
return await asyncio.to_thread(_check)
|
|
except Exception:
|
|
return False
|