Add source call audio playback to vocabulary suggestions

When the induction loop proposes a new vocabulary term, it now records which sampled call(s) most likely produced the suggestion. Admins see a collapsible "▶ source" player under each pending term showing the audio clip and transcript, so they can hear what was actually said before approving or dismissing.

- vocabulary_learner: track sampled call docs and attach source_call_ids to each pending term via a word-overlap search, falling back to the first sampled calls when nothing matches
- types: add optional VocabularyPendingTerm.source_call_ids?: string[]
- c2api: add getCall(id) using the existing GET /calls/{call_id} endpoint
- VocabularyPanel: add SourceCallPlayer component — lazy-loads the call on first expand, shows audio controls + transcript snippet
2026-06-01 01:45:03 -04:00
parent 032eef311f
commit 913fe0cbee
4 changed files with 102 additions and 11 deletions
@@ -18,6 +18,7 @@ import asyncio
 import difflib
 import json
 import random
+import re
 from datetime import datetime, timezone, timedelta
 from typing import Optional
 from app.internal.logger import logger
@@ -297,6 +298,7 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
    random.shuffle(all_calls)
    char_budget = settings.vocabulary_induction_sample_tokens * 4
    transcript_block = ""
+    sampled_call_docs: list[dict] = []
    sampled = 0
    for call in all_calls:
        text = call.get("transcript_corrected") or call.get("transcript") or ""
@@ -306,6 +308,7 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
            break
        tg = call.get("talkgroup_name") or f"TGID {call.get('talkgroup_id', '?')}"
        transcript_block += f"[{tg}] {text}\n"
+        sampled_call_docs.append(call)
        sampled += 1

    if sampled < 3:
@@ -322,11 +325,16 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
    pending_lower = {p["term"].lower() for p in existing_pending}
    vocab_lower   = {t.lower() for t in existing_vocab}

-    to_queue = [
-        {"term": t, "source": "induction", "added_at": now}
-        for t in new_terms
-        if t.lower() not in vocab_lower and t.lower() not in pending_lower
-    ]
+    to_queue = []
+    for t in new_terms:
+        if t.lower() in vocab_lower or t.lower() in pending_lower:
+            continue
+        to_queue.append({
+            "term": t,
+            "source": "induction",
+            "added_at": now,
+            "source_call_ids": _find_source_calls(t, sampled_call_docs),
+        })
    if not to_queue:
        return

@@ -343,6 +351,30 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
 # Internal sync helpers
 # ─────────────────────────────────────────────────────────────────────────────

+def _find_source_calls(term: str, sampled_calls: list[dict], max_results: int = 3) -> list[str]:
+    """
+    Find which sampled calls most likely produced this induction suggestion.
+    Splits the proposed term into tokens and searches call transcripts for overlap.
+    Falls back to the first two sampled calls when no token match is found
+    (e.g. fully garbled terms like "why vac" → "YVAC" have no word overlap).
+    """
+    tokens = [t.lower() for t in re.split(r"[^a-zA-Z0-9]+", term) if len(t) >= 2]
+    matched: list[str] = []
+    if tokens:
+        for call in sampled_calls:
+            call_id = call.get("call_id")
+            if not call_id:
+                continue
+            text = (call.get("transcript_corrected") or call.get("transcript") or "").lower()
+            if any(tok in text for tok in tokens):
+                matched.append(call_id)
+                if len(matched) >= max_results:
+                    break
+    if not matched:
+        matched = [c["call_id"] for c in sampled_calls[:2] if c.get("call_id")]
+    return matched
+
+
 _STOP_WORDS = {
    "the", "and", "for", "are", "was", "were", "this", "that", "with",
    "have", "has", "had", "but", "not", "from", "they", "will", "what",