Files
server-26/drb-c2-core/app/internal/transcription.py
T
Logan 357553f1ea Issue Fix
Upload 404 warning	doc_set(merge=True) in upload.py — creates doc if missing
MQTT call_end 404 error	doc_set(merge=True) in mqtt_handler.py — same root cause
Transcription 404 (saving transcript to nonexistent doc)	doc_set(merge=True) in transcription.py
Transcription ADC credentials error	Explicit service_account.Credentials from gcp-key.json in _sync_transcribe — same pattern as storage.py
2026-04-12 22:04:11 -04:00

80 lines
2.7 KiB
Python

"""
Speech-to-text transcription for recorded calls.
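Uses Google Cloud Speech-to-Text v1, authenticated as the same service account
used by firebase-admin and google-cloud-storage: via the key file at
settings.gcp_credentials_path when that is set, otherwise via Application
Default Credentials (ADC).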
Triggered as a background task from the upload endpoint after a call audio
file has been successfully stored in GCS.
"""
import asyncio
from typing import Optional
from app.internal.logger import logger
from app.internal import firestore as fstore
async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
    """
    Run speech-to-text on a stored call recording and persist the transcript.

    The blocking recognizer call is handed to a worker thread through
    ``asyncio.to_thread``. Both the transcription and the Firestore write are
    best-effort: any exception is logged as a warning and never propagated.

    Args:
        call_id: Firestore document ID in the 'calls' collection.
        gcs_uri: GCS URI of the audio file, e.g. gs://bucket/calls/xyz.mp3

    Returns:
        The transcript string, or None when the URI is empty / not a gs:// URI,
        when transcription raised, or when no text was produced. A transcript
        is still returned if saving it to Firestore fails.
    """
    # Anything that is not a gs:// URI is skipped without logging.
    is_gcs_uri = bool(gcs_uri) and gcs_uri.startswith("gs://")
    if not is_gcs_uri:
        return None

    try:
        transcript = await asyncio.to_thread(_sync_transcribe, gcs_uri)
    except Exception as e:
        logger.warning(f"Transcription failed for call {call_id}: {e}")
        return None

    # Nothing recognised: there is nothing to store, hand the value back as-is.
    if not transcript:
        return transcript

    payload = {"transcript": transcript}
    try:
        await fstore.doc_set("calls", call_id, payload)
        logger.info(f"Transcript saved for call {call_id} ({len(transcript)} chars)")
    except Exception as e:
        # A failed save must not discard the transcript we already have.
        logger.warning(f"Could not save transcript for {call_id}: {e}")
    return transcript
def _sync_transcribe(gcs_uri: str) -> Optional[str]:
    """
    Synchronous STT call — run in a thread via asyncio.to_thread.

    Builds a Speech-to-Text client, submits a long-running recognition request
    for the audio at ``gcs_uri`` and blocks until the result is available.

    Credentials: when ``settings.gcp_credentials_path`` is set, a service-account
    key is loaded from that file with the cloud-platform scope; otherwise the
    client is constructed with no explicit credentials.

    Args:
        gcs_uri: GCS URI of the audio file; passed to the API unchanged.

    Returns:
        The first alternative of every result, joined with single spaces and
        stripped, or None if that text is empty.

    Raises:
        Nothing is caught here: errors from loading credentials, from the
        Speech client, or from waiting on the operation (which waits at most
        120 seconds) propagate to the caller.
    """
    # Imported lazily so the module can be imported without these packages
    # being touched until a transcription actually runs.
    from google.cloud import speech
    from app.config import settings

    if settings.gcp_credentials_path:
        from google.oauth2 import service_account

        creds = service_account.Credentials.from_service_account_file(
            settings.gcp_credentials_path,
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
        client = speech.SpeechClient(credentials=creds)
    else:
        client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    # NOTE(review): encoding and sample rate are fixed to MP3 @ 22050 Hz —
    # confirm this matches what the recorder actually uploads.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=22050,
        language_code="en-US",
        enable_automatic_punctuation=True,
        model="latest_long",
    )

    # A fresh client (and transport) is created on every call and is not
    # shared, so release it once the operation has finished instead of leaving
    # the connection open. The result must be awaited inside the block because
    # polling goes through the same transport.
    with client:
        # Use long_running_recognize for reliability; it handles both short and long audio
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=120)

    parts = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    return " ".join(parts).strip() or None