diff --git a/drb-c2-core/.env.example b/drb-c2-core/.env.example index e950e00..edab2f2 100644 --- a/drb-c2-core/.env.example +++ b/drb-c2-core/.env.example @@ -18,6 +18,9 @@ GCS_BUCKET=your-bucket-name # How long (seconds) before a node is marked offline if no checkin received NODE_OFFLINE_THRESHOLD=90 +# OpenAI Whisper — for audio transcription +OPENAI_API_KEY= + # Auth — static key that edge nodes send as Bearer token on /upload # Generate with: openssl rand -hex 32 NODE_API_KEY= diff --git a/drb-c2-core/app/config.py b/drb-c2-core/app/config.py index 0502e4f..61ee209 100644 --- a/drb-c2-core/app/config.py +++ b/drb-c2-core/app/config.py @@ -17,6 +17,9 @@ class Settings(BaseSettings): # Node health node_offline_threshold: int = 90 # seconds without checkin before marking offline + # OpenAI + openai_api_key: Optional[str] = None + # Internal service key — allows server-side services (discord bot) to call C2 without Firebase service_key: Optional[str] = None diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py index c05600c..47fe9ed 100644 --- a/drb-c2-core/app/internal/transcription.py +++ b/drb-c2-core/app/internal/transcription.py @@ -1,13 +1,12 @@ """ -Speech-to-text transcription for recorded calls. +Speech-to-text transcription for recorded calls using OpenAI Whisper. -Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service -account used by firebase-admin and google-cloud-storage). - -Triggered as a background task from the upload endpoint after a call audio -file has been successfully stored in GCS. +Audio is downloaded from GCS then sent to the Whisper API. Falls back to +returning None on any failure so the intelligence pipeline can still run. """ import asyncio +import tempfile +import os from typing import Optional from app.internal.logger import logger from app.internal import firestore as fstore @@ -44,36 +43,51 @@ async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]: def _sync_transcribe(gcs_uri: str) -> Optional[str]: - """Synchronous STT call — run in a thread via asyncio.to_thread.""" - from google.cloud import speech + """Download audio from GCS and transcribe with OpenAI Whisper.""" + from google.cloud import storage as gcs + from google.oauth2 import service_account + from openai import OpenAI from app.config import settings + if not settings.openai_api_key: + logger.warning("OPENAI_API_KEY not set — transcription disabled.") + return None + + # Parse gs://bucket/path/to/file.mp3 + without_scheme = gcs_uri[len("gs://"):] + bucket_name, blob_path = without_scheme.split("/", 1) + + # Download to a temp file if settings.gcp_credentials_path: - from google.oauth2 import service_account creds = service_account.Credentials.from_service_account_file( settings.gcp_credentials_path, scopes=["https://www.googleapis.com/auth/cloud-platform"], ) - client = speech.SpeechClient(credentials=creds) + gcs_client = gcs.Client(credentials=creds) else: - client = speech.SpeechClient() + gcs_client = gcs.Client() - audio = speech.RecognitionAudio(uri=gcs_uri) - config = speech.RecognitionConfig( - encoding=speech.RecognitionConfig.AudioEncoding.MP3, - sample_rate_hertz=22050, - language_code="en-US", - enable_automatic_punctuation=True, - model="latest_long", - ) + bucket = gcs_client.bucket(bucket_name) + blob = bucket.blob(blob_path) - # Use long_running_recognize for reliability; it handles both short and long audio - operation = client.long_running_recognize(config=config, audio=audio) - response = operation.result(timeout=120) + suffix = os.path.splitext(blob_path)[1] or ".mp3" + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: + tmp_path = tmp.name - parts = [ - result.alternatives[0].transcript - for result in response.results - if result.alternatives - ] - return " ".join(parts).strip() or None + try: + blob.download_to_filename(tmp_path) + + openai_client = OpenAI(api_key=settings.openai_api_key) + with open(tmp_path, "rb") as f: + response = openai_client.audio.transcriptions.create( + model="whisper-1", + file=f, + language="en", + prompt="Public safety radio communication. May include police codes, fire, EMS, talkgroup IDs, unit numbers, addresses.", + ) + return response.text.strip() or None + finally: + try: + os.unlink(tmp_path) + except OSError: + pass diff --git a/drb-c2-core/requirements.txt b/drb-c2-core/requirements.txt index 160b080..40c7d5b 100644 --- a/drb-c2-core/requirements.txt +++ b/drb-c2-core/requirements.txt @@ -4,7 +4,7 @@ pydantic-settings paho-mqtt>=2.0.0 firebase-admin google-cloud-storage -google-cloud-speech +openai httpx python-multipart pytest