fix: raise garbage-transcript threshold to avoid false positives on plate reads

Phonetic run threshold 5 → 12: a plate spellout ("Foxtrot Alpha Uniform Lima Kilo...") produces 6–8 consecutive phonetic words, which met the old threshold of 5, triggering false positives and blocking intelligence extraction on legitimate calls. The new threshold of 12 is safely above any real spellout (~8 max) while still catching the full-alphabet hallucination (26 words). Also writes skip_reason="garbage_transcript" to the call doc and surfaces it in the admin correlation debug endpoint.
2026-05-25 03:31:43 -04:00
parent 92c9d8effc
commit 7dd090e8b2
2 changed files with 8 additions and 1 deletions
@@ -94,11 +94,13 @@ def _is_garbage_transcript(transcript: str) -> bool:
    if not words:
        return False

+    # Threshold of 12: well above any legitimate plate/name spellout (~6–8 words)
+    # but catches the full-alphabet hallucination (26 words in sequence).
    run = 0
    for w in words:
        if w in _PHONETIC_ALPHA_WORDS:
            run += 1
-            if run >= 5:
+            if run >= 12:
                return True
        else:
            run = 0
@@ -154,6 +156,10 @@ async def extract_scenes(
            f"Intelligence: call {call_id} — garbage transcript detected "
            f"(Whisper hallucination), skipping extraction"
        )
+        try:
+            await fstore.doc_set("calls", call_id, {"skip_reason": "garbage_transcript"})
+        except Exception:
+            pass
        return []

    raw_scenes: list[dict] = await asyncio.to_thread(