fix: raise garbage-transcript threshold to avoid false positives on plate reads

Phonetic run threshold 5 → 12: a plate spellout ("Foxtrot Alpha Uniform Lima
Kilo...") produces 6–8 consecutive phonetic words, triggering false positives
and blocking intelligence extraction on legitimate calls. 12 is safely above
any real spellout (~8 max) while still catching the full-alphabet hallucination
(26 words). Also writes skip_reason="garbage_transcript" to the call doc and
surfaces it in the admin correlation debug endpoint.
This commit is contained in:
Logan
2026-05-25 03:31:43 -04:00
parent 92c9d8effc
commit 7dd090e8b2
2 changed files with 8 additions and 1 deletions
+7 -1
View File
@@ -94,11 +94,13 @@ def _is_garbage_transcript(transcript: str) -> bool:
if not words:
return False
# Threshold of 12: well above any legitimate plate/name spellout (~6–8 words)
# but catches the full-alphabet hallucination (26 words in sequence).
run = 0
for w in words:
if w in _PHONETIC_ALPHA_WORDS:
run += 1
if run >= 5:
if run >= 12:
return True
else:
run = 0
@@ -154,6 +156,10 @@ async def extract_scenes(
f"Intelligence: call {call_id} — garbage transcript detected "
f"(Whisper hallucination), skipping extraction"
)
try:
await fstore.doc_set("calls", call_id, {"skip_reason": "garbage_transcript"})
except Exception:
pass
return []
raw_scenes: list[dict] = await asyncio.to_thread(