change model to whisper

This commit is contained in:
Logan
2026-04-12 22:36:21 -04:00
parent b29dcc1518
commit 757bfe82e0
4 changed files with 49 additions and 29 deletions
+42 -28
View File
@@ -1,13 +1,12 @@
"""
Speech-to-text transcription for recorded calls.
Speech-to-text transcription for recorded calls using OpenAI Whisper.
Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service
account used by firebase-admin and google-cloud-storage).
Triggered as a background task from the upload endpoint after a call audio
file has been successfully stored in GCS.
Audio is downloaded from GCS then sent to the Whisper API. Falls back to
returning None on any failure so the intelligence pipeline can still run.
"""
import asyncio
import tempfile
import os
from typing import Optional
from app.internal.logger import logger
from app.internal import firestore as fstore
@@ -44,36 +43,51 @@ async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
# NOTE(review): this span appears to be a rendered diff hunk: lines from the
# previous Google Speech-to-Text implementation and from the new OpenAI Whisper
# implementation are interleaved without +/- markers, and the original
# indentation is missing. Confirm against the repository which lines are live.
def _sync_transcribe(gcs_uri: str) -> Optional[str]:
# NOTE(review): two docstrings follow; presumably the first belongs to the
# earlier implementation and the second replaces it.
"""Synchronous STT call — run in a thread via asyncio.to_thread."""
from google.cloud import speech
"""Download audio from GCS and transcribe with OpenAI Whisper."""
from google.cloud import storage as gcs
from google.oauth2 import service_account
from openai import OpenAI
from app.config import settings
# Without an OpenAI API key in settings, log a warning and return None
# instead of attempting a transcription.
if not settings.openai_api_key:
logger.warning("OPENAI_API_KEY not set — transcription disabled.")
return None
# Parse gs://bucket/path/to/file.mp3
# NOTE(review): the first len("gs://") characters are dropped without checking
# that gcs_uri actually starts with "gs://", and split("/", 1) raises
# ValueError when there is no "/" after the bucket name.
without_scheme = gcs_uri[len("gs://"):]
bucket_name, blob_path = without_scheme.split("/", 1)
# Download to a temp file
# When settings.gcp_credentials_path is set, build service-account credentials
# from that file with the cloud-platform scope; otherwise the client is
# constructed without explicit credentials.
if settings.gcp_credentials_path:
from google.oauth2 import service_account
creds = service_account.Credentials.from_service_account_file(
settings.gcp_credentials_path,
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = speech.SpeechClient(credentials=creds)
gcs_client = gcs.Client(credentials=creds)
else:
client = speech.SpeechClient()
gcs_client = gcs.Client()
# NOTE(review): the RecognitionAudio / RecognitionConfig statements below look
# like the earlier Speech-to-Text path (MP3, 22050 Hz, en-US, "latest_long").
audio = speech.RecognitionAudio(uri=gcs_uri)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.MP3,
sample_rate_hertz=22050,
language_code="en-US",
enable_automatic_punctuation=True,
model="latest_long",
)
bucket = gcs_client.bucket(bucket_name)
blob = bucket.blob(blob_path)
# Use long_running_recognize for reliability; it handles both short and long audio
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=120)
# Reuse the blob's file extension for the temp file, defaulting to ".mp3"
# when the blob path has none.
suffix = os.path.splitext(blob_path)[1] or ".mp3"
# delete=False keeps the file on disk after the context manager exits; only
# its path is retained here, and it is removed explicitly via os.unlink below.
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
tmp_path = tmp.name
# NOTE(review): the comprehension and join below look like the earlier path:
# join the first alternative of each result, returning None for empty text.
parts = [
result.alternatives[0].transcript
for result in response.results
if result.alternatives
]
return " ".join(parts).strip() or None
# Download the blob to the temp path, send the file to the "whisper-1"
# transcription endpoint, and return the stripped text (None when empty).
try:
blob.download_to_filename(tmp_path)
openai_client = OpenAI(api_key=settings.openai_api_key)
with open(tmp_path, "rb") as f:
response = openai_client.audio.transcriptions.create(
model="whisper-1",
file=f,
language="en",
prompt="Public safety radio communication. May include police codes, fire, EMS, talkgroup IDs, unit numbers, addresses.",
)
return response.text.strip() or None
finally:
# Best-effort cleanup: an OSError while removing the temp file is ignored.
try:
os.unlink(tmp_path)
except OSError:
pass