Change transcription model to OpenAI Whisper
This commit is contained in:
@@ -18,6 +18,9 @@ GCS_BUCKET=your-bucket-name
|
|||||||
# How long (seconds) before a node is marked offline if no check-in is received
|
# How long (seconds) before a node is marked offline if no check-in is received
|
||||||
NODE_OFFLINE_THRESHOLD=90
|
NODE_OFFLINE_THRESHOLD=90
|
||||||
|
|
||||||
|
# OpenAI Whisper — for audio transcription
|
||||||
|
OPENAI_API_KEY=
|
||||||
|
|
||||||
# Auth — static key that edge nodes send as Bearer token on /upload
|
# Auth — static key that edge nodes send as Bearer token on /upload
|
||||||
# Generate with: openssl rand -hex 32
|
# Generate with: openssl rand -hex 32
|
||||||
NODE_API_KEY=
|
NODE_API_KEY=
|
||||||
|
|||||||
@@ -17,6 +17,9 @@ class Settings(BaseSettings):
|
|||||||
# Node health
|
# Node health
|
||||||
node_offline_threshold: int = 90 # seconds without checkin before marking offline
|
node_offline_threshold: int = 90 # seconds without checkin before marking offline
|
||||||
|
|
||||||
|
# OpenAI
|
||||||
|
openai_api_key: Optional[str] = None
|
||||||
|
|
||||||
# Internal service key — allows server-side services (Discord bot) to call C2 without Firebase
|
# Internal service key — allows server-side services (Discord bot) to call C2 without Firebase
|
||||||
service_key: Optional[str] = None
|
service_key: Optional[str] = None
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,12 @@
|
|||||||
"""
|
"""
|
||||||
Speech-to-text transcription for recorded calls.
|
Speech-to-text transcription for recorded calls using OpenAI Whisper.
|
||||||
|
|
||||||
Uses Google Cloud Speech-to-Text v1 (authenticated via the same ADC / service
|
Audio is downloaded from GCS then sent to the Whisper API. Falls back to
|
||||||
account used by firebase-admin and google-cloud-storage).
|
returning None on any failure so the intelligence pipeline can still run.
|
||||||
|
|
||||||
Triggered as a background task from the upload endpoint after a call audio
|
|
||||||
file has been successfully stored in GCS.
|
|
||||||
"""
|
"""
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from app.internal.logger import logger
|
from app.internal.logger import logger
|
||||||
from app.internal import firestore as fstore
|
from app.internal import firestore as fstore
|
||||||
@@ -44,36 +43,51 @@ async def transcribe_call(call_id: str, gcs_uri: str) -> Optional[str]:
|
|||||||
|
|
||||||
|
|
||||||
def _sync_transcribe(gcs_uri: str) -> Optional[str]:
|
def _sync_transcribe(gcs_uri: str) -> Optional[str]:
|
||||||
"""Synchronous STT call — run in a thread via asyncio.to_thread."""
|
"""Download audio from GCS and transcribe with OpenAI Whisper."""
|
||||||
from google.cloud import speech
|
from google.cloud import storage as gcs
|
||||||
|
from google.oauth2 import service_account
|
||||||
|
from openai import OpenAI
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
|
|
||||||
|
if not settings.openai_api_key:
|
||||||
|
logger.warning("OPENAI_API_KEY not set — transcription disabled.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Parse gs://bucket/path/to/file.mp3
|
||||||
|
without_scheme = gcs_uri[len("gs://"):]
|
||||||
|
bucket_name, blob_path = without_scheme.split("/", 1)
|
||||||
|
|
||||||
|
# Download to a temp file
|
||||||
if settings.gcp_credentials_path:
|
if settings.gcp_credentials_path:
|
||||||
from google.oauth2 import service_account
|
|
||||||
creds = service_account.Credentials.from_service_account_file(
|
creds = service_account.Credentials.from_service_account_file(
|
||||||
settings.gcp_credentials_path,
|
settings.gcp_credentials_path,
|
||||||
scopes=["https://www.googleapis.com/auth/cloud-platform"],
|
scopes=["https://www.googleapis.com/auth/cloud-platform"],
|
||||||
)
|
)
|
||||||
client = speech.SpeechClient(credentials=creds)
|
gcs_client = gcs.Client(credentials=creds)
|
||||||
else:
|
else:
|
||||||
client = speech.SpeechClient()
|
gcs_client = gcs.Client()
|
||||||
|
|
||||||
audio = speech.RecognitionAudio(uri=gcs_uri)
|
bucket = gcs_client.bucket(bucket_name)
|
||||||
config = speech.RecognitionConfig(
|
blob = bucket.blob(blob_path)
|
||||||
encoding=speech.RecognitionConfig.AudioEncoding.MP3,
|
|
||||||
sample_rate_hertz=22050,
|
|
||||||
language_code="en-US",
|
|
||||||
enable_automatic_punctuation=True,
|
|
||||||
model="latest_long",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use long_running_recognize for reliability; it handles both short and long audio
|
suffix = os.path.splitext(blob_path)[1] or ".mp3"
|
||||||
operation = client.long_running_recognize(config=config, audio=audio)
|
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
||||||
response = operation.result(timeout=120)
|
tmp_path = tmp.name
|
||||||
|
|
||||||
parts = [
|
try:
|
||||||
result.alternatives[0].transcript
|
blob.download_to_filename(tmp_path)
|
||||||
for result in response.results
|
|
||||||
if result.alternatives
|
openai_client = OpenAI(api_key=settings.openai_api_key)
|
||||||
]
|
with open(tmp_path, "rb") as f:
|
||||||
return " ".join(parts).strip() or None
|
response = openai_client.audio.transcriptions.create(
|
||||||
|
model="whisper-1",
|
||||||
|
file=f,
|
||||||
|
language="en",
|
||||||
|
prompt="Public safety radio communication. May include police codes, fire, EMS, talkgroup IDs, unit numbers, addresses.",
|
||||||
|
)
|
||||||
|
return response.text.strip() or None
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(tmp_path)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ pydantic-settings
|
|||||||
paho-mqtt>=2.0.0
|
paho-mqtt>=2.0.0
|
||||||
firebase-admin
|
firebase-admin
|
||||||
google-cloud-storage
|
google-cloud-storage
|
||||||
google-cloud-speech
|
openai
|
||||||
httpx
|
httpx
|
||||||
python-multipart
|
python-multipart
|
||||||
pytest
|
pytest
|
||||||
|
|||||||
Reference in New Issue
Block a user