Switch call transcription from Google Cloud Speech-to-Text to OpenAI Whisper

2026-04-12 22:36:21 -04:00
parent b29dcc1518
commit 757bfe82e0
4 changed files with 49 additions and 29 deletions
@@ -18,6 +18,9 @@ GCS_BUCKET=your-bucket-name
 # How long (seconds) before a node is marked offline if no checkin received
 NODE_OFFLINE_THRESHOLD=90
 # OpenAI API key — used for Whisper audio transcription; transcription is disabled when left empty
 OPENAI_API_KEY=
 # Auth — static key that edge nodes send as Bearer token on /upload
 # Generate with: openssl rand -hex 32
 NODE_API_KEY=
@@ -17,6 +17,9 @@ class Settings(BaseSettings):
    # Node health
    node_offline_threshold: int = 90  # seconds without checkin before marking offline
    # OpenAI API key for Whisper transcription (transcription is disabled when unset)
    openai_api_key: Optional[str] = None
    # Internal service key — allows server-side services (discord bot) to call C2 without Firebase
    service_key: Optional[str] = None
@@ -1,13 +1,12 @@
 """
-Speech-to-text transcription for recorded calls.
+Speech-to-text transcription for recorded calls using OpenAI Whisper.
-Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service
+Audio is downloaded from GCS then sent to the Whisper API. Falls back to
-account used by firebase-admin and google-cloud-storage).
+returning None on any failure so the intelligence pipeline can still run.
 Triggered as a background task from the upload endpoint after a call audio
 file has been successfully stored in GCS.
 """
 import asyncio
 import tempfile
 import os
 from typing import Optional
 from app.internal.logger import logger
 from app.internal import firestore as fstore
@@ -44,36 +43,51 @@ async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
 def _sync_transcribe(gcs_uri: str) -> Optional[str]:
-    """Synchronous STT call — run in a thread via asyncio.to_thread."""
+    """Download audio from GCS and transcribe with OpenAI Whisper."""
-    from google.cloud import speech
+    from google.cloud import storage as gcs
    from google.oauth2 import service_account
    from openai import OpenAI
    from app.config import settings
    if not settings.openai_api_key:
        logger.warning("OPENAI_API_KEY not set — transcription disabled.")
        return None
    # Parse gs://bucket/path/to/file.mp3
    without_scheme = gcs_uri[len("gs://"):]
    bucket_name, blob_path = without_scheme.split("/", 1)
    # Build a GCS client (explicit service-account credentials when configured), then download the blob to a temp file
    if settings.gcp_credentials_path:
        from google.oauth2 import service_account
        creds = service_account.Credentials.from_service_account_file(
            settings.gcp_credentials_path,
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
-        client = speech.SpeechClient(credentials=creds)
+        gcs_client = gcs.Client(credentials=creds)
    else:
-        client = speech.SpeechClient()
+        gcs_client = gcs.Client()
-    audio = speech.RecognitionAudio(uri=gcs_uri)
+    bucket = gcs_client.bucket(bucket_name)
-    config = speech.RecognitionConfig(
+    blob = bucket.blob(blob_path)
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=22050,
        language_code="en-US",
        enable_automatic_punctuation=True,
        model="latest_long",
    )
-    # Use long_running_recognize for reliability; it handles both short and long audio
+    suffix = os.path.splitext(blob_path)[1] or ".mp3"
-    operation = client.long_running_recognize(config=config, audio=audio)
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
-    response = operation.result(timeout=120)
+        tmp_path = tmp.name
-    parts = [
+    try:
-        result.alternatives[0].transcript
+        blob.download_to_filename(tmp_path)
-        for result in response.results
+
-        if result.alternatives
+        openai_client = OpenAI(api_key=settings.openai_api_key)
-    ]
+        with open(tmp_path, "rb") as f:
-    return " ".join(parts).strip() or None
+            response = openai_client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                language="en",
                prompt="Public safety radio communication. May include police codes, fire, EMS, talkgroup IDs, unit numbers, addresses.",
            )
        return response.text.strip() or None
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
@@ -4,7 +4,7 @@ pydantic-settings
 paho-mqtt>=2.0.0
 firebase-admin
 google-cloud-storage
-google-cloud-speech
+openai
 httpx
 python-multipart
 pytest