Add source call audio playback to vocabulary suggestions

When the induction loop proposes a new vocabulary term, it now records
which sampled call(s) most likely produced the suggestion. Admins see
a collapsible "▶ source" player under each pending term showing the
audio clip and transcript, so they can hear what was actually said
before approving or dismissing.

- vocabulary_learner: track sampled call docs, attach source_call_ids
  to each pending term via word-overlap search with fallback
- types: VocabularyPendingTerm.source_call_ids?: string[]
- c2api: add getCall(id) using existing GET /calls/{call_id} endpoint
- VocabularyPanel: SourceCallPlayer component — lazy-loads call on
  first expand, shows audio controls + transcript snippet
This commit is contained in:
Logan
2026-06-01 01:45:03 -04:00
parent 032eef311f
commit 913fe0cbee
4 changed files with 102 additions and 11 deletions
+37 -5
View File
@@ -18,6 +18,7 @@ import asyncio
import difflib
import json
import random
import re
from datetime import datetime, timezone, timedelta
from typing import Optional
from app.internal.logger import logger
@@ -297,6 +298,7 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
random.shuffle(all_calls)
char_budget = settings.vocabulary_induction_sample_tokens * 4
transcript_block = ""
sampled_call_docs: list[dict] = []
sampled = 0
for call in all_calls:
text = call.get("transcript_corrected") or call.get("transcript") or ""
@@ -306,6 +308,7 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
break
tg = call.get("talkgroup_name") or f"TGID {call.get('talkgroup_id', '?')}"
transcript_block += f"[{tg}] {text}\n"
sampled_call_docs.append(call)
sampled += 1
if sampled < 3:
@@ -322,11 +325,16 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
pending_lower = {p["term"].lower() for p in existing_pending}
vocab_lower = {t.lower() for t in existing_vocab}
to_queue = [
{"term": t, "source": "induction", "added_at": now}
for t in new_terms
if t.lower() not in vocab_lower and t.lower() not in pending_lower
]
to_queue = []
for t in new_terms:
if t.lower() in vocab_lower or t.lower() in pending_lower:
continue
to_queue.append({
"term": t,
"source": "induction",
"added_at": now,
"source_call_ids": _find_source_calls(t, sampled_call_docs),
})
if not to_queue:
return
@@ -343,6 +351,30 @@ async def _induct_system(system_id: str, system_doc: dict) -> None:
# Internal sync helpers
# ─────────────────────────────────────────────────────────────────────────────
def _find_source_calls(term: str, sampled_calls: list[dict], max_results: int = 3) -> list[str]:
"""
Find which sampled calls most likely produced this induction suggestion.
Splits the proposed term into tokens and searches call transcripts for overlap.
Falls back to the first two sampled calls when no token match is found
(e.g. fully garbled terms like "why vac""YVAC" have no word overlap).
"""
tokens = [t.lower() for t in re.split(r"[^a-zA-Z0-9]+", term) if len(t) >= 2]
matched: list[str] = []
if tokens:
for call in sampled_calls:
call_id = call.get("call_id")
if not call_id:
continue
text = (call.get("transcript_corrected") or call.get("transcript") or "").lower()
if any(tok in text for tok in tokens):
matched.append(call_id)
if len(matched) >= max_results:
break
if not matched:
matched = [c["call_id"] for c in sampled_calls[:2] if c.get("call_id")]
return matched
_STOP_WORDS = {
"the", "and", "for", "are", "was", "were", "this", "that", "with",
"have", "has", "had", "but", "not", "from", "they", "will", "what",