server-26/drb-c2-core/app/internal/transcription.py

"""
Speech-to-text transcription for recorded calls.

Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service
account used by firebase-admin and google-cloud-storage).

Triggered as a background task from the upload endpoint after a call audio
file has been successfully stored in GCS.
"""
import asyncio
from typing import Optional
from app.internal.logger import logger
from app.internal import firestore as fstore


async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
    """
    Run speech-to-text on a stored call recording and persist the transcript.

    The blocking STT request is executed in a worker thread so the event loop
    stays free. Every failure is logged as a warning and swallowed: this
    coroutine never raises for transcription or persistence errors.

    Args:
        call_id: Firestore document ID in the 'calls' collection.
        gcs_uri:  GCS URI of the audio file, e.g. gs://bucket/calls/xyz.mp3

    Returns:
        The transcript string. None is returned when the URI is missing or is
        not a gs:// URI, when transcription raised, or when the recogniser
        produced no text. A failed Firestore write does NOT turn the result
        into None; the transcript is still returned to the caller.
    """
    # Guard clause: anything that is not a non-empty gs:// URI is skipped.
    looks_like_gcs = bool(gcs_uri) and gcs_uri.startswith("gs://")
    if not looks_like_gcs:
        return None

    # Off-load the synchronous client call to a thread.
    try:
        text = await asyncio.to_thread(_sync_transcribe, gcs_uri)
    except Exception as err:
        logger.warning(f"Transcription failed for call {call_id}: {err}")
        return None

    # Nothing recognised: there is nothing to store.
    if not text:
        return text

    # Best-effort persistence; a storage failure must not lose the result.
    try:
        await fstore.doc_update("calls", call_id, {"transcript": text})
        logger.info(f"Transcript saved for call {call_id} ({len(text)} chars)")
    except Exception as err:
        logger.warning(f"Could not save transcript for {call_id}: {err}")

    return text


def _sync_transcribe(
    gcs_uri: str,
    *,
    language_code: str = "en-US",
    sample_rate_hertz: int = 22050,
    timeout: float = 120,
) -> Optional[str]:
    """Synchronous STT call — run in a thread via asyncio.to_thread.

    Submits a long-running recognition request for the MP3 audio at
    ``gcs_uri``, blocks until the operation finishes (or ``timeout`` elapses)
    and joins the top alternative of every result into one string.

    Args:
        gcs_uri: GCS URI of the audio file (gs://bucket/object).
        language_code: Language tag passed to the recogniser.
        sample_rate_hertz: Sample rate declared in the recognition config.
            NOTE(review): presumably this has to match the real sample rate
            of the recorded files — confirm against the recorder output.
        timeout: Seconds to wait for the long-running operation's result.

    Returns:
        The whitespace-normalised transcript, or None when the response
        contains no non-empty text.

    Raises:
        Whatever the Speech client or ``operation.result`` raises (including
        a timeout); nothing is caught here, the caller decides how to react.
    """
    # Imported lazily so the module can be imported without the Speech
    # client library being loaded.
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        enable_automatic_punctuation=True,
        model="latest_long",
    )

    # Use long_running_recognize for reliability; it handles both short and long audio
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=timeout)

    # Trim each segment and skip empty ones before joining: joining the raw
    # segments would leave doubled spaces whenever a segment carries its own
    # leading/trailing whitespace or its top alternative is an empty string.
    segments = (
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    )
    return " ".join(segment for segment in segments if segment) or None