STT bugfix: request verbose_json only for whisper-1; use json for newer models

2026-05-17 19:37:38 -04:00
parent 97ed691cd2
commit 9d73fc52fa
1 changed file with 34 additions and 25 deletions
@@ -113,6 +113,10 @@ def _sync_transcribe(
        tg_prefix    = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
        prompt       = tg_prefix + vocab_prefix + _WHISPER_PROMPT

+        # Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob).
+        # Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text.
+        use_verbose = settings.stt_model == "whisper-1"
+
        openai_client = OpenAI(api_key=settings.openai_api_key)
        with open(tmp_path, "rb") as f:
            response = openai_client.audio.transcriptions.create(
@@ -120,9 +124,11 @@ def _sync_transcribe(
                file=f,
                language="en",
                prompt=prompt,
-                response_format="verbose_json",
+                response_format="verbose_json" if use_verbose else "json",
                temperature=0,
            )
+
+        if use_verbose:
            # Filter hallucinated segments.  Two sources of hallucination in P25 recordings:
            #
            # 1. Trailing silence / static — Whisper fills silence past real content with
@@ -141,12 +147,15 @@ def _sync_transcribe(
                and getattr(s, "no_speech_prob", 0.0) < 0.8
            ]
            # Reconstruct text from non-hallucinated segments only so the two stay
-        # in sync.  If every segment was filtered (e.g. pure static or repeated
-        # prompt-word hallucination like "Standby. Standby. Standby..."), text
-        # becomes None which prevents the intelligence pipeline from running on
-        # hallucinated content.
+            # in sync.  If every segment was filtered, text becomes None which prevents
+            # the intelligence pipeline from running on hallucinated content.
            text = " ".join(s["text"] for s in segments) or None
            return text, segments
+        else:
+            # json format returns just {"text": "..."} — no segments or timestamps.
+            # Intelligence extraction falls back to treating the whole transcript as one block.
+            text = (response.text or "").strip() or None
+            return text, []
    finally:
        try:
            os.unlink(tmp_path)