change model to whisper

Logan
2026-04-12 22:36:21 -04:00
parent b29dcc1518
commit 757bfe82e0
4 changed files with 49 additions and 29 deletions
+3
@@ -18,6 +18,9 @@ GCS_BUCKET=your-bucket-name
 # How long (seconds) before a node is marked offline if no checkin received
 NODE_OFFLINE_THRESHOLD=90
 
+# OpenAI Whisper — for audio transcription
+OPENAI_API_KEY=
+
 # Auth — static key that edge nodes send as Bearer token on /upload
 # Generate with: openssl rand -hex 32
 NODE_API_KEY=
+3
@@ -17,6 +17,9 @@ class Settings(BaseSettings):
     # Node health
     node_offline_threshold: int = 90  # seconds without checkin before marking offline
 
+    # OpenAI
+    openai_api_key: Optional[str] = None
+
     # Internal service key — allows server-side services (discord bot) to call C2 without Firebase
     service_key: Optional[str] = None
+42 -28
@@ -1,13 +1,12 @@
 """
-Speech-to-text transcription for recorded calls.
+Speech-to-text transcription for recorded calls using OpenAI Whisper.
 
-Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service
-account used by firebase-admin and google-cloud-storage).
-
-Triggered as a background task from the upload endpoint after a call audio
-file has been successfully stored in GCS.
+Audio is downloaded from GCS then sent to the Whisper API. Falls back to
+returning None on any failure so the intelligence pipeline can still run.
 """
 import asyncio
+import tempfile
+import os
 
 from typing import Optional
 
 from app.internal.logger import logger
 from app.internal import firestore as fstore
@@ -44,36 +43,51 @@ async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
 def _sync_transcribe(gcs_uri: str) -> Optional[str]:
-    """Synchronous STT call — run in a thread via asyncio.to_thread."""
-    from google.cloud import speech
-    from google.oauth2 import service_account
+    """Download audio from GCS and transcribe with OpenAI Whisper."""
+    from google.cloud import storage as gcs
+    from openai import OpenAI
     from app.config import settings
 
+    if not settings.openai_api_key:
+        logger.warning("OPENAI_API_KEY not set — transcription disabled.")
+        return None
+
+    # Parse gs://bucket/path/to/file.mp3
+    without_scheme = gcs_uri[len("gs://"):]
+    bucket_name, blob_path = without_scheme.split("/", 1)
+
+    # Download to a temp file
     if settings.gcp_credentials_path:
+        from google.oauth2 import service_account
         creds = service_account.Credentials.from_service_account_file(
             settings.gcp_credentials_path,
             scopes=["https://www.googleapis.com/auth/cloud-platform"],
         )
-        client = speech.SpeechClient(credentials=creds)
+        gcs_client = gcs.Client(credentials=creds)
     else:
-        client = speech.SpeechClient()
+        gcs_client = gcs.Client()
 
-    audio = speech.RecognitionAudio(uri=gcs_uri)
-    config = speech.RecognitionConfig(
-        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
-        sample_rate_hertz=22050,
-        language_code="en-US",
-        enable_automatic_punctuation=True,
-        model="latest_long",
-    )
+    bucket = gcs_client.bucket(bucket_name)
+    blob = bucket.blob(blob_path)
 
-    # Use long_running_recognize for reliability; it handles both short and long audio
-    operation = client.long_running_recognize(config=config, audio=audio)
-    response = operation.result(timeout=120)
+    suffix = os.path.splitext(blob_path)[1] or ".mp3"
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp_path = tmp.name
 
-    parts = [
-        result.alternatives[0].transcript
-        for result in response.results
-        if result.alternatives
-    ]
-    return " ".join(parts).strip() or None
+    try:
+        blob.download_to_filename(tmp_path)
+
+        openai_client = OpenAI(api_key=settings.openai_api_key)
+        with open(tmp_path, "rb") as f:
+            response = openai_client.audio.transcriptions.create(
+                model="whisper-1",
+                file=f,
+                language="en",
+                prompt="Public safety radio communication. May include police codes, fire, EMS, talkgroup IDs, unit numbers, addresses.",
+            )
+        return response.text.strip() or None
+    finally:
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
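
The body of `transcribe_call` is not part of this diff (only its signature appears in the hunk header), but the new module docstring spells out its contract: run the blocking work off the event loop and return None on any failure. A sketch of what that wrapper could look like, offered purely as an assumption on top of the `_sync_transcribe` shown above:

```python
import asyncio
from typing import Optional

from app.internal.logger import logger


async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
    """Hypothetical wrapper; the real implementation is not shown in this commit."""
    try:
        # _sync_transcribe blocks on the GCS download and the Whisper HTTP call,
        # so push it onto a worker thread instead of blocking the event loop.
        return await asyncio.to_thread(_sync_transcribe, gcs_uri)
    except Exception:
        logger.exception(f"Transcription failed for call {call_id}")
        return None
```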
+1 -1
@@ -4,7 +4,7 @@ pydantic-settings
 paho-mqtt>=2.0.0
 firebase-admin
 google-cloud-storage
-google-cloud-speech
+openai
 httpx
 python-multipart
 pytest
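
With the dependency swapped, a quick end-to-end check is to call the module directly against an existing recording. The module path and GCS URI below are placeholders, not taken from this commit:

```python
# smoke_test_transcription.py (hypothetical; adjust the import path and URI to the repo)
import asyncio

from app.internal.transcription import transcribe_call  # assumed module location

text = asyncio.run(transcribe_call("call-demo", "gs://your-bucket-name/calls/call-demo.mp3"))
print(text or "no transcript (check OPENAI_API_KEY and the gs:// URI)")
```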