STT bugfix: request verbose_json only from whisper-1; use json for newer models
This commit is contained in:
@@ -113,6 +113,10 @@ def _sync_transcribe(
|
|||||||
tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
|
tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
|
||||||
prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT
|
prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT
|
||||||
|
|
||||||
|
# Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob).
|
||||||
|
# Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text.
|
||||||
|
use_verbose = settings.stt_model == "whisper-1"
|
||||||
|
|
||||||
openai_client = OpenAI(api_key=settings.openai_api_key)
|
openai_client = OpenAI(api_key=settings.openai_api_key)
|
||||||
with open(tmp_path, "rb") as f:
|
with open(tmp_path, "rb") as f:
|
||||||
response = openai_client.audio.transcriptions.create(
|
response = openai_client.audio.transcriptions.create(
|
||||||
@@ -120,33 +124,38 @@ def _sync_transcribe(
|
|||||||
file=f,
|
file=f,
|
||||||
language="en",
|
language="en",
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
response_format="verbose_json",
|
response_format="verbose_json" if use_verbose else "json",
|
||||||
temperature=0,
|
temperature=0,
|
||||||
)
|
)
|
||||||
# Filter hallucinated segments. Two sources of hallucination in P25 recordings:
|
|
||||||
#
|
if use_verbose:
|
||||||
# 1. Trailing silence / static — Whisper fills silence past real content with
|
# Filter hallucinated segments. Two sources of hallucination in P25 recordings:
|
||||||
# sequential radio codes (10-4, 10-5...). Clamped by audio duration.
|
#
|
||||||
#
|
# 1. Trailing silence / static — Whisper fills silence past real content with
|
||||||
# 2. Leading silence — OP25 recordings typically have a short silence at the
|
# sequential radio codes (10-4, 10-5...). Clamped by audio duration.
|
||||||
# start before the first PTT press. Whisper sometimes hallucinates filler
|
#
|
||||||
# words or codes over this silence. Detected via no_speech_prob > 0.8
|
# 2. Leading silence — OP25 recordings typically have a short silence at the
|
||||||
# (Whisper's own confidence that a segment contains no real speech).
|
# start before the first PTT press. Whisper sometimes hallucinates filler
|
||||||
audio_duration: float = getattr(response, "duration", None) or float("inf")
|
# words or codes over this silence. Detected via no_speech_prob > 0.8
|
||||||
segments = [
|
# (Whisper's own confidence that a segment contains no real speech).
|
||||||
{"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
|
audio_duration: float = getattr(response, "duration", None) or float("inf")
|
||||||
for s in (response.segments or [])
|
segments = [
|
||||||
if s.text.strip()
|
{"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
|
||||||
and s.start < audio_duration
|
for s in (response.segments or [])
|
||||||
and getattr(s, "no_speech_prob", 0.0) < 0.8
|
if s.text.strip()
|
||||||
]
|
and s.start < audio_duration
|
||||||
# Reconstruct text from non-hallucinated segments only so the two stay
|
and getattr(s, "no_speech_prob", 0.0) < 0.8
|
||||||
# in sync. If every segment was filtered (e.g. pure static or repeated
|
]
|
||||||
# prompt-word hallucination like "Standby. Standby. Standby..."), text
|
# Reconstruct text from non-hallucinated segments only so the two stay
|
||||||
# becomes None which prevents the intelligence pipeline from running on
|
# in sync. If every segment was filtered, text becomes None which prevents
|
||||||
# hallucinated content.
|
# the intelligence pipeline from running on hallucinated content.
|
||||||
text = " ".join(s["text"] for s in segments) or None
|
text = " ".join(s["text"] for s in segments) or None
|
||||||
return text, segments
|
return text, segments
|
||||||
|
else:
|
||||||
|
# json format returns just {"text": "..."} — no segments or timestamps.
|
||||||
|
# Intelligence extraction falls back to treating the whole transcript as one block.
|
||||||
|
text = (response.text or "").strip() or None
|
||||||
|
return text, []
|
||||||
finally:
|
finally:
|
||||||
try:
|
try:
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
|
|||||||
Reference in New Issue
Block a user