diff --git a/drb-c2-core/app/internal/vocabulary_learner.py b/drb-c2-core/app/internal/vocabulary_learner.py index d90df7b..3b6e84d 100644 --- a/drb-c2-core/app/internal/vocabulary_learner.py +++ b/drb-c2-core/app/internal/vocabulary_learner.py @@ -18,6 +18,7 @@ import asyncio import difflib import json import random +import re from datetime import datetime, timezone, timedelta from typing import Optional from app.internal.logger import logger @@ -297,6 +298,7 @@ async def _induct_system(system_id: str, system_doc: dict) -> None: random.shuffle(all_calls) char_budget = settings.vocabulary_induction_sample_tokens * 4 transcript_block = "" + sampled_call_docs: list[dict] = [] sampled = 0 for call in all_calls: text = call.get("transcript_corrected") or call.get("transcript") or "" @@ -306,6 +308,7 @@ async def _induct_system(system_id: str, system_doc: dict) -> None: break tg = call.get("talkgroup_name") or f"TGID {call.get('talkgroup_id', '?')}" transcript_block += f"[{tg}] {text}\n" + sampled_call_docs.append(call) sampled += 1 if sampled < 3: @@ -322,11 +325,16 @@ async def _induct_system(system_id: str, system_doc: dict) -> None: pending_lower = {p["term"].lower() for p in existing_pending} vocab_lower = {t.lower() for t in existing_vocab} - to_queue = [ - {"term": t, "source": "induction", "added_at": now} - for t in new_terms - if t.lower() not in vocab_lower and t.lower() not in pending_lower - ] + to_queue = [] + for t in new_terms: + if t.lower() in vocab_lower or t.lower() in pending_lower: + continue + to_queue.append({ + "term": t, + "source": "induction", + "added_at": now, + "source_call_ids": _find_source_calls(t, sampled_call_docs), + }) if not to_queue: return @@ -343,6 +351,30 @@ async def _induct_system(system_id: str, system_doc: dict) -> None: # Internal sync helpers # ───────────────────────────────────────────────────────────────────────────── +def _find_source_calls(term: str, sampled_calls: list[dict], max_results: int = 3) -> list[str]: + """ + Find which sampled calls most likely produced this induction suggestion. + Splits the proposed term into tokens and searches call transcripts for overlap. + Falls back to the first two sampled calls when no token match is found + (e.g. fully garbled terms like "why vac" → "YVAC" have no word overlap). + """ + tokens = [t.lower() for t in re.split(r"[^a-zA-Z0-9]+", term) if len(t) >= 2] + matched: list[str] = [] + if tokens: + for call in sampled_calls: + call_id = call.get("call_id") + if not call_id: + continue + text = (call.get("transcript_corrected") or call.get("transcript") or "").lower() + if any(tok in text for tok in tokens): + matched.append(call_id) + if len(matched) >= max_results: + break + if not matched: + matched = [c["call_id"] for c in sampled_calls[:2] if c.get("call_id")] + return matched + + _STOP_WORDS = { "the", "and", "for", "are", "was", "were", "this", "that", "with", "have", "has", "had", "but", "not", "from", "they", "will", "what", diff --git a/drb-frontend/app/systems/page.tsx b/drb-frontend/app/systems/page.tsx index ba4b0e8..427ce2b 100644 --- a/drb-frontend/app/systems/page.tsx +++ b/drb-frontend/app/systems/page.tsx @@ -829,6 +829,54 @@ function AiFlagsPanel({ systemId, initial }: { systemId: string; initial: System ); } +// ── Source call audio player ────────────────────────────────────────────────── + +function SourceCallPlayer({ callId }: { callId: string }) { + const [call, setCall] = useState<{ audio_url?: string | null; transcript?: string | null; transcript_corrected?: string | null } | null>(null); + const [loading, setLoading] = useState(false); + const [open, setOpen] = useState(false); + + async function toggle() { + if (!open && !call) { + setLoading(true); + try { + const c = await c2api.getCall(callId); + setCall(c as typeof call); + } finally { + setLoading(false); + } + } + setOpen((v) => !v); + } + + const transcript = call?.transcript_corrected || call?.transcript; + + return ( +
No audio
+ )} + {transcript && ( +{transcript}
+ )} +Induction suggestions ({pending.length})
-