Start learning vocabulary from talkgroups to improve speech-to-text (STT) accuracy

This commit is contained in:
Logan
2026-04-21 22:17:30 -04:00
parent 6612e4b683
commit 338b946ba3
11 changed files with 759 additions and 8 deletions
+12 -2
View File
@@ -36,7 +36,7 @@ Rules:
System: {system_id}
Talkgroup: {talkgroup_name}
{transcript_block}"""
{vocabulary_block}{transcript_block}"""
# Nominatim viewbox half-width in degrees (0.1° ≈ 11 km north–south; narrower east–west at mid-latitudes)
_GEO_DELTA = 0.1
@@ -76,8 +76,15 @@ async def extract_tags(
Side-effect: updates calls/{call_id} in Firestore with tags, location,
location_coords, vehicles, units, severity, transcript_corrected; also stores embedding.
"""
# Load per-system vocabulary for prompt injection
vocabulary: list[str] = []
if system_id:
from app.internal.vocabulary_learner import get_vocabulary
vocab_data = await get_vocabulary(system_id)
vocabulary = vocab_data.get("vocabulary") or []
result = await asyncio.to_thread(
_sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments
_sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary
)
tags: list[str] = result.get("tags") or []
@@ -233,6 +240,7 @@ def _sync_extract(
talkgroup_id: Optional[int],
system_id: Optional[str],
segments: Optional[list[dict]],
vocabulary: Optional[list[str]] = None,
) -> dict:
"""Call GPT-4o mini and parse the JSON response."""
from app.config import settings
@@ -242,11 +250,13 @@ def _sync_extract(
logger.warning("OPENAI_API_KEY not set — intelligence extraction disabled.")
return {}
from app.internal.vocabulary_learner import build_gpt_vocab_block
tg = f"{talkgroup_name} (TGID {talkgroup_id})" if talkgroup_id else (talkgroup_name or "unknown")
prompt = _PROMPT_TEMPLATE.format(
transcript_block=_build_transcript_block(transcript, segments),
talkgroup_name=tg,
system_id=system_id or "unknown",
vocabulary_block=build_gpt_vocab_block(vocabulary or []),
)
try: