Switch call transcription from Google Cloud Speech-to-Text to OpenAI Whisper

2026-04-12 22:36:21 -04:00
parent b29dcc1518
commit 757bfe82e0
4 changed files with 49 additions and 29 deletions
@@ -18,6 +18,9 @@ GCS_BUCKET=your-bucket-name
 # How long (seconds) before a node is marked offline if no checkin received
 NODE_OFFLINE_THRESHOLD=90

+# OpenAI Whisper — for audio transcription
+OPENAI_API_KEY=
+
 # Auth — static key that edge nodes send as Bearer token on /upload
 # Generate with: openssl rand -hex 32
 NODE_API_KEY=
@@ -17,6 +17,9 @@ class Settings(BaseSettings):
    # Node health
    node_offline_threshold: int = 90  # seconds without checkin before marking offline

+    # OpenAI
+    openai_api_key: Optional[str] = None
+
    # Internal service key — allows server-side services (discord bot) to call C2 without Firebase
    service_key: Optional[str] = None

@@ -1,13 +1,12 @@
 """
-Speech-to-text transcription for recorded calls.
+Speech-to-text transcription for recorded calls using OpenAI Whisper.

-Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service
-account used by firebase-admin and google-cloud-storage).
-
-Triggered as a background task from the upload endpoint after a call audio
-file has been successfully stored in GCS.
+Audio is downloaded from GCS then sent to the Whisper API. Falls back to
+returning None on any failure so the intelligence pipeline can still run.
 """
 import asyncio
+import tempfile
+import os
 from typing import Optional
 from app.internal.logger import logger
 from app.internal import firestore as fstore
@@ -44,36 +43,51 @@ async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:


 def _sync_transcribe(gcs_uri: str) -> Optional[str]:
-    """Synchronous STT call — run in a thread via asyncio.to_thread."""
-    from google.cloud import speech
+    """Download audio from GCS and transcribe with OpenAI Whisper."""
+    from google.cloud import storage as gcs
+    from google.oauth2 import service_account
+    from openai import OpenAI
    from app.config import settings

+    if not settings.openai_api_key:
+        logger.warning("OPENAI_API_KEY not set — transcription disabled.")
+        return None
+
+    # Parse gs://bucket/path/to/file.mp3
+    without_scheme = gcs_uri[len("gs://"):]
+    bucket_name, blob_path = without_scheme.split("/", 1)
+
+    # Download to a temp file
    if settings.gcp_credentials_path:
-        from google.oauth2 import service_account
        creds = service_account.Credentials.from_service_account_file(
            settings.gcp_credentials_path,
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
-        client = speech.SpeechClient(credentials=creds)
+        gcs_client = gcs.Client(credentials=creds)
    else:
-        client = speech.SpeechClient()
+        gcs_client = gcs.Client()

-    audio = speech.RecognitionAudio(uri=gcs_uri)
-    config = speech.RecognitionConfig(
-        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
-        sample_rate_hertz=22050,
-        language_code="en-US",
-        enable_automatic_punctuation=True,
-        model="latest_long",
-    )
+    bucket = gcs_client.bucket(bucket_name)
+    blob = bucket.blob(blob_path)

-    # Use long_running_recognize for reliability; it handles both short and long audio
-    operation = client.long_running_recognize(config=config, audio=audio)
-    response = operation.result(timeout=120)
+    suffix = os.path.splitext(blob_path)[1] or ".mp3"
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp_path = tmp.name

-    parts = [
-        result.alternatives[0].transcript
-        for result in response.results
-        if result.alternatives
-    ]
-    return " ".join(parts).strip() or None
+    try:
+        blob.download_to_filename(tmp_path)
+
+        openai_client = OpenAI(api_key=settings.openai_api_key)
+        with open(tmp_path, "rb") as f:
+            response = openai_client.audio.transcriptions.create(
+                model="whisper-1",
+                file=f,
+                language="en",
+                prompt="Public safety radio communication. May include police codes, fire, EMS, talkgroup IDs, unit numbers, addresses.",
+            )
+        return response.text.strip() or None
+    finally:
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
@@ -4,7 +4,7 @@ pydantic-settings
 paho-mqtt>=2.0.0
 firebase-admin
 google-cloud-storage
-google-cloud-speech
+openai
 httpx
 python-multipart
 pytest