From 9d73fc52fa2415601a11c75f79a2f5022832f756 Mon Sep 17 00:00:00 2001 From: Logan Date: Sun, 17 May 2026 19:37:38 -0400 Subject: [PATCH] STT bugfix --- drb-c2-core/app/internal/transcription.py | 59 +++++++++++++---------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py index c7a37d8..8af0db9 100644 --- a/drb-c2-core/app/internal/transcription.py +++ b/drb-c2-core/app/internal/transcription.py @@ -113,6 +113,10 @@ def _sync_transcribe( tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else "" prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT + # Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob). + # Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text. + use_verbose = settings.stt_model == "whisper-1" + openai_client = OpenAI(api_key=settings.openai_api_key) with open(tmp_path, "rb") as f: response = openai_client.audio.transcriptions.create( @@ -120,33 +124,38 @@ def _sync_transcribe( file=f, language="en", prompt=prompt, - response_format="verbose_json", + response_format="verbose_json" if use_verbose else "json", temperature=0, ) - # Filter hallucinated segments. Two sources of hallucination in P25 recordings: - # - # 1. Trailing silence / static — Whisper fills silence past real content with - # sequential radio codes (10-4, 10-5...). Clamped by audio duration. - # - # 2. Leading silence — OP25 recordings typically have a short silence at the - # start before the first PTT press. Whisper sometimes hallucinates filler - # words or codes over this silence. Detected via no_speech_prob > 0.8 - # (Whisper's own confidence that a segment contains no real speech). - audio_duration: float = getattr(response, "duration", None) or float("inf") - segments = [ - {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()} - for s in (response.segments or []) - if s.text.strip() - and s.start < audio_duration - and getattr(s, "no_speech_prob", 0.0) < 0.8 - ] - # Reconstruct text from non-hallucinated segments only so the two stay - # in sync. If every segment was filtered (e.g. pure static or repeated - # prompt-word hallucination like "Standby. Standby. Standby..."), text - # becomes None which prevents the intelligence pipeline from running on - # hallucinated content. - text = " ".join(s["text"] for s in segments) or None - return text, segments + + if use_verbose: + # Filter hallucinated segments. Two sources of hallucination in P25 recordings: + # + # 1. Trailing silence / static — Whisper fills silence past real content with + # sequential radio codes (10-4, 10-5...). Clamped by audio duration. + # + # 2. Leading silence — OP25 recordings typically have a short silence at the + # start before the first PTT press. Whisper sometimes hallucinates filler + # words or codes over this silence. Detected via no_speech_prob > 0.8 + # (Whisper's own confidence that a segment contains no real speech). + audio_duration: float = getattr(response, "duration", None) or float("inf") + segments = [ + {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()} + for s in (response.segments or []) + if s.text.strip() + and s.start < audio_duration + and getattr(s, "no_speech_prob", 0.0) < 0.8 + ] + # Reconstruct text from non-hallucinated segments only so the two stay + # in sync. If every segment was filtered, text becomes None which prevents + # the intelligence pipeline from running on hallucinated content. + text = " ".join(s["text"] for s in segments) or None + return text, segments + else: + # json format returns just {"text": "..."} — no segments or timestamps. + # Intelligence extraction falls back to treating the whole transcript as one block. + text = (response.text or "").strip() or None + return text, [] finally: try: os.unlink(tmp_path)