STT bugfix

This commit is contained in:
Logan
2026-05-17 19:37:38 -04:00
parent 97ed691cd2
commit 9d73fc52fa
+34 -25
View File
@@ -113,6 +113,10 @@ def _sync_transcribe(
tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else "" tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT
# Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob).
# Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text.
use_verbose = settings.stt_model == "whisper-1"
openai_client = OpenAI(api_key=settings.openai_api_key) openai_client = OpenAI(api_key=settings.openai_api_key)
with open(tmp_path, "rb") as f: with open(tmp_path, "rb") as f:
response = openai_client.audio.transcriptions.create( response = openai_client.audio.transcriptions.create(
@@ -120,33 +124,38 @@ def _sync_transcribe(
file=f, file=f,
language="en", language="en",
prompt=prompt, prompt=prompt,
response_format="verbose_json", response_format="verbose_json" if use_verbose else "json",
temperature=0, temperature=0,
) )
# Filter hallucinated segments. Two sources of hallucination in P25 recordings:
# if use_verbose:
# 1. Trailing silence / static — Whisper fills silence past real content with # Filter hallucinated segments. Two sources of hallucination in P25 recordings:
# sequential radio codes (10-4, 10-5...). Clamped by audio duration. #
# # 1. Trailing silence / static — Whisper fills silence past real content with
# 2. Leading silence — OP25 recordings typically have a short silence at the # sequential radio codes (10-4, 10-5...). Clamped by audio duration.
# start before the first PTT press. Whisper sometimes hallucinates filler #
# words or codes over this silence. Detected via no_speech_prob > 0.8 # 2. Leading silence — OP25 recordings typically have a short silence at the
# (Whisper's own confidence that a segment contains no real speech). # start before the first PTT press. Whisper sometimes hallucinates filler
audio_duration: float = getattr(response, "duration", None) or float("inf") # words or codes over this silence. Detected via no_speech_prob > 0.8
segments = [ # (Whisper's own confidence that a segment contains no real speech).
{"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()} audio_duration: float = getattr(response, "duration", None) or float("inf")
for s in (response.segments or []) segments = [
if s.text.strip() {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
and s.start < audio_duration for s in (response.segments or [])
and getattr(s, "no_speech_prob", 0.0) < 0.8 if s.text.strip()
] and s.start < audio_duration
# Reconstruct text from non-hallucinated segments only so the two stay and getattr(s, "no_speech_prob", 0.0) < 0.8
# in sync. If every segment was filtered (e.g. pure static or repeated ]
# prompt-word hallucination like "Standby. Standby. Standby..."), text # Reconstruct text from non-hallucinated segments only so the two stay
# becomes None which prevents the intelligence pipeline from running on # in sync. If every segment was filtered, text becomes None which prevents
# hallucinated content. # the intelligence pipeline from running on hallucinated content.
text = " ".join(s["text"] for s in segments) or None text = " ".join(s["text"] for s in segments) or None
return text, segments return text, segments
else:
# json format returns just {"text": "..."} — no segments or timestamps.
# Intelligence extraction falls back to treating the whole transcript as one block.
text = (response.text or "").strip() or None
return text, []
finally: finally:
try: try:
os.unlink(tmp_path) os.unlink(tmp_path)