From 9d73fc52fa2415601a11c75f79a2f5022832f756 Mon Sep 17 00:00:00 2001
From: Logan <Logan@simplestepsolutions.com>
Date: Sun, 17 May 2026 19:37:38 -0400
Subject: [PATCH] STT bugfix: request verbose_json only for whisper-1

---
 drb-c2-core/app/internal/transcription.py | 59 +++++++++++++----------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py
index c7a37d8..8af0db9 100644
--- a/drb-c2-core/app/internal/transcription.py
+++ b/drb-c2-core/app/internal/transcription.py
@@ -113,6 +113,10 @@ def _sync_transcribe(
         tg_prefix    = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
         prompt       = tg_prefix + vocab_prefix + _WHISPER_PROMPT
 
+        # Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob).
+        # Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text.
+        use_verbose = settings.stt_model == "whisper-1"
+
         openai_client = OpenAI(api_key=settings.openai_api_key)
         with open(tmp_path, "rb") as f:
             response = openai_client.audio.transcriptions.create(
@@ -120,33 +124,38 @@ def _sync_transcribe(
                 file=f,
                 language="en",
                 prompt=prompt,
-                response_format="verbose_json",
+                response_format="verbose_json" if use_verbose else "json",
                 temperature=0,
             )
-        # Filter hallucinated segments.  Two sources of hallucination in P25 recordings:
-        #
-        # 1. Trailing silence / static — Whisper fills silence past real content with
-        #    sequential radio codes (10-4, 10-5...).  Clamped by audio duration.
-        #
-        # 2. Leading silence — OP25 recordings typically have a short silence at the
-        #    start before the first PTT press.  Whisper sometimes hallucinates filler
-        #    words or codes over this silence.  Detected via no_speech_prob > 0.8
-        #    (Whisper's own confidence that a segment contains no real speech).
-        audio_duration: float = getattr(response, "duration", None) or float("inf")
-        segments = [
-            {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
-            for s in (response.segments or [])
-            if s.text.strip()
-            and s.start < audio_duration
-            and getattr(s, "no_speech_prob", 0.0) < 0.8
-        ]
-        # Reconstruct text from non-hallucinated segments only so the two stay
-        # in sync.  If every segment was filtered (e.g. pure static or repeated
-        # prompt-word hallucination like "Standby. Standby. Standby..."), text
-        # becomes None which prevents the intelligence pipeline from running on
-        # hallucinated content.
-        text = " ".join(s["text"] for s in segments) or None
-        return text, segments
+
+        if use_verbose:
+            # Filter hallucinated segments.  Two sources of hallucination in P25 recordings:
+            #
+            # 1. Trailing silence / static — Whisper fills silence past real content with
+            #    sequential radio codes (10-4, 10-5...).  Clamped by audio duration.
+            #
+            # 2. Leading silence — OP25 recordings typically have a short silence at the
+            #    start before the first PTT press.  Whisper sometimes hallucinates filler
+            #    words or codes over this silence.  Detected via no_speech_prob >= 0.8
+            #    (Whisper's own confidence that a segment contains no real speech).
+            audio_duration: float = getattr(response, "duration", None) or float("inf")
+            segments = [
+                {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
+                for s in (response.segments or [])
+                if s.text.strip()
+                and s.start < audio_duration
+                and getattr(s, "no_speech_prob", 0.0) < 0.8
+            ]
+            # Reconstruct text from non-hallucinated segments only so the two stay
+            # in sync.  If every segment was filtered, text becomes None which prevents
+            # the intelligence pipeline from running on hallucinated content.
+            text = " ".join(s["text"] for s in segments) or None
+            return text, segments
+        else:
+            # json format returns just {"text": "..."} — no segments or timestamps.
+            # Intelligence extraction falls back to treating the whole transcript as one block.
+            text = (response.text or "").strip() or None
+            return text, []
     finally:
         try:
             os.unlink(tmp_path)