Files
kite/backend/storage/google_drive_backend.py
T
curo1305 a548266461 refactor(backend): extract shared helper modules per architecture rules
- Add backend/ai/utils.py — parse_classification, parse_suggestions, strip_code_fences
  shared by all AI providers; removes duplicated private functions from
  anthropic_provider.py and openai_provider.py
- Add backend/deps/utils.py — get_client_ip, parse_uuid request-parsing helpers;
  removes local _ip() variants from admin.py, auth.py, shares.py, folders.py
- Add backend/storage/exceptions.py — canonical CloudConnectionError definition;
  all routers and backends import from here instead of redefining
- Move validate_password_strength to backend/services/auth.py; removes duplicated
  _validate_password_strength from admin.py and auth.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 16:10:35 +02:00

243 lines
9.5 KiB
Python

"""
Google Drive v3 StorageBackend implementation for DocuVault.
Design notes:
- All sync googleapiclient calls are wrapped in asyncio.to_thread() to
avoid blocking the FastAPI event loop (Pitfall 7 — google-api-python-client
is synchronous; every .execute() call makes an HTTP request).
- cache_discovery=False on build() prevents the client library from writing
a JSON discovery document to /tmp, which would be a directory traversal
risk (T-05-03-05).
- D-14: presigned_get_url and generate_presigned_put_url raise
NotImplementedError. The API upload endpoint detects cloud backends and
uses the direct put_object() path instead.
- B2 design: This backend is STATELESS. It raises CloudConnectionError but
does NOT update the DB or CloudConnection objects. DB state transitions
(e.g., REQUIRES_REAUTH) are handled by the _call_cloud_op() helper in
cloud.py (Plan 05-05), which has the DB session.
- Token key format stored in credentials dict:
access_token — current OAuth bearer token
refresh_token — long-lived refresh token
token_uri — OAuth token endpoint
client_id — OAuth client ID
client_secret — OAuth client secret
expiry — ISO 8601 string (optional)
"""
from __future__ import annotations
import asyncio
import datetime
import io
import uuid
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
from storage.base import StorageBackend
from storage.exceptions import CloudConnectionError # noqa: F401 re-exported for import compatibility
class GoogleDriveBackend(StorageBackend):
"""Google Drive v3 implementation of StorageBackend.
Every sync googleapiclient call is wrapped in asyncio.to_thread() (Pitfall 7).
The backend is stateless — it raises CloudConnectionError on token issues
but never writes to the DB.
"""
SCOPES = ["https://www.googleapis.com/auth/drive.file"]
def __init__(self, credentials: dict) -> None:
"""Initialise with a decrypted credentials dict.
Expected keys:
access_token str — current OAuth bearer token
refresh_token str — long-lived refresh token (may be None)
token_uri str — defaults to https://oauth2.googleapis.com/token
client_id str
client_secret str
expiry str — ISO 8601 datetime string (optional)
"""
self._creds_dict = credentials
self._creds = self._dict_to_google_creds(credentials)
# ── Internal helpers ──────────────────────────────────────────────────────
def _dict_to_google_creds(self, d: dict) -> Credentials:
"""Build a google.oauth2.credentials.Credentials object from a stored dict."""
creds = Credentials(
token=d["access_token"],
refresh_token=d.get("refresh_token"),
token_uri=d.get("token_uri", "https://oauth2.googleapis.com/token"),
client_id=d.get("client_id"),
client_secret=d.get("client_secret"),
)
if d.get("expiry"):
creds.expiry = datetime.datetime.fromisoformat(d["expiry"])
return creds
def _get_service(self):
"""Build the Drive v3 service object.
cache_discovery=False prevents the client library from caching the
discovery document in /tmp — important for security (T-05-03-05).
"""
return build("drive", "v3", credentials=self._creds, cache_discovery=False)
def _handle_http_error(self, exc: HttpError) -> None:
"""Classify an HttpError and raise CloudConnectionError if appropriate."""
status = exc.resp.status
if status == 401:
# Token expired — API layer should refresh and retry
raise CloudConnectionError(
"Google Drive access token expired", reason="token_expired"
) from exc
# Check for invalid_grant in the error body
if status == 400:
body = exc.content.decode("utf-8", errors="replace").lower()
if "invalid_grant" in body:
raise CloudConnectionError(
"Google Drive refresh token revoked (invalid_grant)",
reason="invalid_grant",
) from exc
raise exc
# ── StorageBackend interface ──────────────────────────────────────────────
async def put_object(
self,
user_id: str,
document_id: str,
file_bytes: bytes,
extension: str,
content_type: str,
) -> str:
"""Upload bytes to Google Drive and return the Drive file_id as object_key.
The file is named "{document_id}{extension}" in Drive — no human filename
is used in provider storage (D-04 spirit / T-05-03-04).
Returns the Google Drive file_id (object_key for this document).
"""
file_name = f"{document_id}{extension}"
file_metadata = {"name": file_name}
buf = io.BytesIO(file_bytes)
def _upload() -> str:
service = self._get_service()
media = MediaIoBaseUpload(buf, mimetype=content_type, resumable=False)
try:
result = (
service.files()
.create(body=file_metadata, media_body=media, fields="id")
.execute()
)
return result["id"]
except HttpError as exc:
self._handle_http_error(exc)
return await asyncio.to_thread(_upload)
async def get_object(self, object_key: str) -> bytes:
"""Download file bytes from Google Drive by Drive file_id.
Uses MediaIoBaseDownload to stream the file into a BytesIO buffer.
"""
def _download() -> bytes:
service = self._get_service()
try:
request = service.files().get_media(fileId=object_key)
buf = io.BytesIO()
downloader = MediaIoBaseDownload(buf, request)
done = False
while not done:
_, done = downloader.next_chunk()
return buf.getvalue()
except HttpError as exc:
self._handle_http_error(exc)
return await asyncio.to_thread(_download)
async def delete_object(self, object_key: str) -> None:
"""Delete a Drive file by file_id. Silent no-op if the file is not found (404)."""
def _delete() -> None:
service = self._get_service()
try:
service.files().delete(fileId=object_key).execute()
except HttpError as exc:
if exc.resp.status == 404:
return # Already deleted — no-op per StorageBackend contract
self._handle_http_error(exc)
await asyncio.to_thread(_delete)
async def presigned_get_url(self, object_key: str, expires_minutes: int = 60) -> str:
"""Not supported by Google Drive — raises NotImplementedError (D-14).
Use get_object() for direct streaming instead. The API layer proxies
content through the FastAPI endpoint.
"""
raise NotImplementedError(
"Google Drive backend does not support presigned URLs — "
"use get_object() for streaming"
)
async def generate_presigned_put_url(
self, object_key: str, expires_minutes: int = 15
) -> str:
"""Not supported by Google Drive — raises NotImplementedError (D-14).
Use put_object() for direct upload via FastAPI intermediary.
"""
raise NotImplementedError(
"Google Drive backend does not support presigned put URLs — "
"use put_object() for direct upload"
)
async def stat_object(self, object_key: str) -> int:
"""Return the file size in bytes from Google Drive metadata.
Calls files().get(fileId=object_key, fields='size') and returns
int(metadata.get('size', 0)).
Note: Google Workspace files (Docs, Sheets, Slides) have no binary size
and return an empty 'size' field. DocuVault only uploads binary files,
so the 0 fallback handles this edge case.
"""
def _stat() -> int:
service = self._get_service()
try:
metadata = (
service.files()
.get(fileId=object_key, fields="size")
.execute()
)
return int(metadata.get("size", 0))
except HttpError as exc:
self._handle_http_error(exc)
return await asyncio.to_thread(_stat)
async def health_check(self) -> bool:
"""Return True if the Drive service is reachable and the token is valid.
Makes a minimal files().list(pageSize=1) call. Returns False on any error
to avoid propagating transient failures up the call stack.
"""
def _check() -> bool:
service = self._get_service()
service.files().list(pageSize=1, fields="files(id)").execute()
return True
try:
return await asyncio.to_thread(_check)
except Exception:
return False