Start learning vocabulary from talkgroups to improve STT accuracy
@@ -28,6 +28,10 @@ class Settings(BaseSettings):
    location_proximity_km: float = 0.5  # radius for location-proximity matching
    incident_auto_resolve_minutes: int = 90  # auto-resolve after N minutes with no new calls

    # Vocabulary learning
    vocabulary_induction_interval_hours: int = 24  # how often the induction loop runs
    vocabulary_induction_sample_tokens: int = 4000  # ~tokens of transcript text sampled per system

    # Internal service key — allows server-side services (discord bot) to call C2 without Firebase
    service_key: Optional[str] = None

@@ -36,7 +36,7 @@ Rules:

System: {system_id}
Talkgroup: {talkgroup_name}
-{transcript_block}"""
+{vocabulary_block}{transcript_block}"""

# Nominatim viewbox half-width in degrees (~11 km at mid-latitudes)
_GEO_DELTA = 0.1

@@ -76,8 +76,15 @@ async def extract_tags(
    Side-effect: updates calls/{call_id} in Firestore with tags, location,
    location_coords, vehicles, units, severity, transcript_corrected; also stores embedding.
    """
    # Load per-system vocabulary for prompt injection
    vocabulary: list[str] = []
    if system_id:
        from app.internal.vocabulary_learner import get_vocabulary
        vocab_data = await get_vocabulary(system_id)
        vocabulary = vocab_data.get("vocabulary") or []

    result = await asyncio.to_thread(
-        _sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments
+        _sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary
    )

    tags: list[str] = result.get("tags") or []

@@ -233,6 +240,7 @@ def _sync_extract(
    talkgroup_id: Optional[int],
    system_id: Optional[str],
    segments: Optional[list[dict]],
    vocabulary: Optional[list[str]] = None,
) -> dict:
    """Call GPT-4o mini and parse the JSON response."""
    from app.config import settings
@@ -242,11 +250,13 @@ def _sync_extract(
        logger.warning("OPENAI_API_KEY not set — intelligence extraction disabled.")
        return {}

    from app.internal.vocabulary_learner import build_gpt_vocab_block
    tg = f"{talkgroup_name} (TGID {talkgroup_id})" if talkgroup_id else (talkgroup_name or "unknown")
    prompt = _PROMPT_TEMPLATE.format(
        transcript_block=_build_transcript_block(transcript, segments),
        talkgroup_name=tg,
        system_id=system_id or "unknown",
        vocabulary_block=build_gpt_vocab_block(vocabulary or []),
    )

    try:

@@ -28,6 +28,7 @@ async def transcribe_call(
    call_id: str,
    gcs_uri: str,
    talkgroup_name: Optional[str] = None,
    system_id: Optional[str] = None,
) -> tuple[Optional[str], list[dict]]:
    """
    Transcribe audio at the given GCS URI and store the result in Firestore.

@@ -39,8 +40,17 @@ async def transcribe_call(
    if not gcs_uri or not gcs_uri.startswith("gs://"):
        return None, []

    # Load vocabulary for this system (empty list if none yet)
    vocabulary: list[str] = []
    if system_id:
        from app.internal.vocabulary_learner import get_vocabulary
        vocab_data = await get_vocabulary(system_id)
        vocabulary = vocab_data.get("vocabulary") or []

    try:
-        transcript, segments = await asyncio.to_thread(_sync_transcribe, gcs_uri, talkgroup_name)
+        transcript, segments = await asyncio.to_thread(
+            _sync_transcribe, gcs_uri, talkgroup_name, vocabulary
+        )
    except Exception as e:
        logger.warning(f"Transcription failed for call {call_id}: {e}")
        return None, []

@@ -61,7 +71,11 @@ async def transcribe_call(
    return transcript, segments


-def _sync_transcribe(gcs_uri: str, talkgroup_name: Optional[str] = None) -> tuple[Optional[str], list[dict]]:
+def _sync_transcribe(
+    gcs_uri: str,
+    talkgroup_name: Optional[str] = None,
+    vocabulary: Optional[list[str]] = None,
+) -> tuple[Optional[str], list[dict]]:
    """Download audio from GCS and transcribe with OpenAI Whisper."""
    from google.cloud import storage as gcs
    from google.oauth2 import service_account

@@ -94,7 +108,10 @@ def _sync_transcribe(gcs_uri: str, talkgroup_name: Optional[str] = None) -> tupl
    try:
        blob.download_to_filename(tmp_path)

-        prompt = (f"Talkgroup: {talkgroup_name}. " + _WHISPER_PROMPT) if talkgroup_name else _WHISPER_PROMPT
+        from app.internal.vocabulary_learner import build_whisper_vocab_prompt
+        vocab_prefix = build_whisper_vocab_prompt(vocabulary or [])
+        tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
+        prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT
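        # Resulting prompt reads like "Talkgroup: Fire Dispatch. YVAC, 5-baker. <base prompt>"
        # (talkgroup name and terms here are illustrative)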

        openai_client = OpenAI(api_key=settings.openai_api_key)
        with open(tmp_path, "rb") as f:

@@ -0,0 +1,426 @@
"""
Per-system vocabulary learning for STT accuracy improvement.

Three mechanisms:
1. Bootstrap — one-shot GPT-4o call generates local knowledge at system setup:
   agencies + abbreviations, unit naming, streets, acronyms.
2. Correction — diffs admin transcript edits, extracts corrected tokens → vocabulary.
3. Induction — background loop samples N tokens of transcripts per system,
   asks GPT-4o-mini to propose new terms → queued as pending for review.

Firestore schema additions on system documents:
  vocabulary: list[str] — approved terms; injected into Whisper + GPT prompts
  vocabulary_pending: list[dict] — induction suggestions awaiting admin review
    each: {term, source, added_at}
  vocabulary_bootstrapped: bool — bootstrap has been run at least once
"""

import asyncio
import difflib
import json
import random
from datetime import datetime, timezone
from typing import Optional

from app.internal.logger import logger
from app.internal import firestore as fstore
from app.config import settings


# ─────────────────────────────────────────────────────────────────────────────
# Prompt templates
# ─────────────────────────────────────────────────────────────────────────────

_BOOTSTRAP_PROMPT = """\
You are building a radio vocabulary dictionary to improve speech-to-text accuracy for a P25 \
public-safety radio monitoring system in a specific area. The STT model has no local knowledge, \
so common terms like "YVAC" get transcribed as "why vac", "5-baker" as "5 acre", etc.

System name: {system_name}
System type: {system_type}
Area context: {area_hint}

Return ONLY a JSON object:
{{"vocabulary": [list of strings]}}

Include terms you are confident about for this area:
- Agency names and their radio abbreviations (e.g. "YVAC" = Yorktown Volunteer Ambulance Corps)
- Unit ID examples using the local naming convention (e.g. "5-baker", "5-charlie", "1-david";
  many departments use APCO phonetics: adam, baker, charles, david, edward, frank, george,
  henry, ida, john, king, lincoln, mary, nora, ocean, paul, queen, robert, sam, tom, union,
  victor, william, x-ray, young, zebra)
- Major routes, roads, and key intersections
- Local landmarks and geographic references dispatchers use
- Agency-specific codes that differ from standard APCO

Return a flat list of strings — abbreviations, proper names, unit IDs, street names.
Do NOT include common English words. Max 80 terms. Only include what you are confident is \
accurate for this specific area; return fewer terms rather than guessing."""
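
# Example model response (terms illustrative):
#   {"vocabulary": ["YVAC", "5-baker", "Route 118", "Taconic State Parkway"]}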

_INDUCTION_PROMPT = """\
You are analyzing P25 emergency radio transcripts to find vocabulary terms that should be \
added to improve future speech-to-text accuracy for this system.

System: {system_name}
Existing approved vocabulary (do not re-propose these): {existing_vocab}

Sampled transcripts:
{transcript_block}

Find terms that are LIKELY STT errors or local terms missing from the vocabulary:
- Unit IDs that appear garbled (e.g. "5 acre" → "5-baker")
- Agency acronyms spelled out phonetically (e.g. "why vac" → "YVAC")
- Street names or locations that look misspelled or oddly transcribed
- Callsigns or local codes not yet in the vocabulary

Return ONLY a JSON object:
{{"new_terms": ["term1", "term2", ...]}}

Only include high-confidence additions not already in existing vocabulary.
Return {{"new_terms": []}} if nothing new is found."""


# ─────────────────────────────────────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────────────────────────────────────

async def bootstrap_system_vocabulary(system_id: str) -> list[str]:
    """
    One-shot GPT-4o bootstrap: generate local-knowledge vocabulary for a system.
    Merges generated terms into system.vocabulary and sets vocabulary_bootstrapped=True.
    Returns the list of newly generated terms.
    """
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        logger.warning(f"Vocabulary bootstrap: system {system_id} not found")
        return []

    system_name = system_doc.get("name", "Unknown")
    system_type = system_doc.get("type", "P25")

    # Build area hint from configured talkgroup names
    talkgroups = system_doc.get("config", {}).get("talkgroups", [])
    tg_names = [tg.get("name", "") for tg in talkgroups if tg.get("name")][:8]
    area_hint = f"Talkgroups include: {', '.join(tg_names)}" if tg_names else "Unknown area"

    terms = await asyncio.to_thread(_sync_bootstrap, system_name, system_type, area_hint)
    if not terms:
        return []

    existing = system_doc.get("vocabulary") or []
    existing_lower = {t.lower() for t in existing}
    to_add = [t for t in terms if t.lower() not in existing_lower]
    merged = list(dict.fromkeys(existing + to_add))

    await fstore.doc_set("systems", system_id, {
        "vocabulary": merged,
        "vocabulary_bootstrapped": True,
    })
    logger.info(
        f"Vocabulary bootstrap: {len(to_add)} term(s) generated for system {system_id} "
        f"({system_name})"
    )
    return to_add


async def learn_from_correction(system_id: str, original: str, corrected: str) -> None:
    """
    Diff original and corrected transcripts; append new tokens to the approved vocabulary.
    Called automatically when an admin saves a transcript correction.
    """
    if not system_id or not original or not corrected:
        return

    new_terms = _diff_new_terms(original, corrected)
    if not new_terms:
        return

    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return

    existing = system_doc.get("vocabulary") or []
    existing_lower = {t.lower() for t in existing}
    to_add = [t for t in new_terms if t.lower() not in existing_lower]
    if not to_add:
        return

    merged = list(dict.fromkeys(existing + to_add))
    await fstore.doc_set("systems", system_id, {"vocabulary": merged})
    logger.info(
        f"Vocabulary: learned {len(to_add)} term(s) from correction on system {system_id}: "
        f"{to_add}"
    )


async def approve_pending_term(system_id: str, term: str) -> None:
    """Move a pending term into the approved vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    pending = [p for p in (system_doc.get("vocabulary_pending") or []) if p["term"] != term]
    vocab = system_doc.get("vocabulary") or []
    if term.lower() not in {t.lower() for t in vocab}:
        vocab = list(dict.fromkeys(vocab + [term]))
    await fstore.doc_set("systems", system_id, {
        "vocabulary": vocab,
        "vocabulary_pending": pending,
    })


async def dismiss_pending_term(system_id: str, term: str) -> None:
    """Remove a pending term without adding it to vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    pending = [p for p in (system_doc.get("vocabulary_pending") or []) if p["term"] != term]
    await fstore.doc_set("systems", system_id, {"vocabulary_pending": pending})


async def add_term(system_id: str, term: str) -> None:
    """Manually add a term to the approved vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    vocab = system_doc.get("vocabulary") or []
    if term.lower() not in {t.lower() for t in vocab}:
        vocab = list(dict.fromkeys(vocab + [term.strip()]))
        await fstore.doc_set("systems", system_id, {"vocabulary": vocab})


async def remove_term(system_id: str, term: str) -> None:
    """Remove a term from the approved vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    vocab = [t for t in (system_doc.get("vocabulary") or []) if t.lower() != term.lower()]
    await fstore.doc_set("systems", system_id, {"vocabulary": vocab})


async def get_vocabulary(system_id: str) -> dict:
    """Return vocabulary and pending terms for a system."""
    doc = await fstore.doc_get("systems", system_id)
    if not doc:
        return {"vocabulary": [], "vocabulary_pending": [], "vocabulary_bootstrapped": False}
    return {
        "vocabulary": doc.get("vocabulary") or [],
        "vocabulary_pending": doc.get("vocabulary_pending") or [],
        "vocabulary_bootstrapped": doc.get("vocabulary_bootstrapped", False),
    }


# ─────────────────────────────────────────────────────────────────────────────
# Prompt-injection helpers (called by transcription.py and intelligence.py)
# ─────────────────────────────────────────────────────────────────────────────

def build_whisper_vocab_prompt(vocabulary: list[str]) -> str:
    """
    Format vocabulary for Whisper prompt injection.
    Whisper's prompt field acts as a context prior with a ~224-token limit.
    The base _WHISPER_PROMPT uses ~70 tokens; we budget ~150 tokens (≈550 chars) here.
    """
    if not vocabulary:
        return ""
    char_budget = 550
    terms: list[str] = []
    used = 0
    for term in vocabulary:
        cost = len(term) + 2  # ", "
        if used + cost > char_budget:
            break
        terms.append(term)
        used += cost
    return ", ".join(terms) + ". " if terms else ""

def build_gpt_vocab_block(vocabulary: list[str]) -> str:
    """Format vocabulary for injection into GPT extraction prompts."""
    if not vocabulary:
        return ""
    return f"Known local terms: {', '.join(vocabulary)}\n"
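
# Example (term illustrative): build_gpt_vocab_block(["YVAC"]) -> "Known local terms: YVAC\n";
# _PROMPT_TEMPLATE renders this line immediately above the transcript block.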

# ─────────────────────────────────────────────────────────────────────────────
# Background induction loop
# ─────────────────────────────────────────────────────────────────────────────

async def vocabulary_induction_loop() -> None:
    interval = settings.vocabulary_induction_interval_hours * 3600
    logger.info(
        f"Vocabulary induction loop started — "
        f"interval: {settings.vocabulary_induction_interval_hours}h, "
        f"sample budget: {settings.vocabulary_induction_sample_tokens} tokens"
    )
    while True:
        await asyncio.sleep(interval)
        try:
            await _run_induction_pass()
        except Exception as e:
            logger.error(f"Vocabulary induction pass failed: {e}")


async def _run_induction_pass() -> None:
    systems = await fstore.collection_list("systems")
    if not systems:
        return
    logger.info(f"Vocabulary induction: processing {len(systems)} system(s)")
    for system in systems:
        system_id = system.get("system_id")
        if system_id:
            try:
                await _induct_system(system_id, system)
            except Exception as e:
                logger.warning(f"Induction failed for system {system_id}: {e}")


async def _induct_system(system_id: str, system_doc: dict) -> None:
    """Sample random transcripts for a system and propose new vocabulary."""
    system_name = system_doc.get("name", "Unknown")
    existing_vocab: list[str] = system_doc.get("vocabulary") or []

    # Fetch recent ended calls for this system
    all_calls = await fstore.collection_list("calls", system_id=system_id, status="ended")
    if not all_calls:
        return

    # Random sample up to the token budget (4 chars ≈ 1 token)
    random.shuffle(all_calls)
    char_budget = settings.vocabulary_induction_sample_tokens * 4
    transcript_block = ""
    sampled = 0
    for call in all_calls:
        text = call.get("transcript_corrected") or call.get("transcript") or ""
        if not text:
            continue
        if len(transcript_block) + len(text) > char_budget:
            break
        tg = call.get("talkgroup_name") or f"TGID {call.get('talkgroup_id', '?')}"
        transcript_block += f"[{tg}] {text}\n"
        sampled += 1

    if sampled < 3:
        return  # not enough data to learn from yet

    new_terms = await asyncio.to_thread(
        _sync_induct, system_name, existing_vocab, transcript_block
    )
    if not new_terms:
        return

    now = datetime.now(timezone.utc).isoformat()
    existing_pending: list[dict] = system_doc.get("vocabulary_pending") or []
    pending_lower = {p["term"].lower() for p in existing_pending}
    vocab_lower = {t.lower() for t in existing_vocab}

    to_queue = [
        {"term": t, "source": "induction", "added_at": now}
        for t in new_terms
        if t.lower() not in vocab_lower and t.lower() not in pending_lower
    ]
    if not to_queue:
        return

    await fstore.doc_set("systems", system_id, {
        "vocabulary_pending": existing_pending + to_queue,
    })
    logger.info(
        f"Vocabulary induction: {len(to_queue)} new term(s) proposed for "
        f"system {system_id} ({system_name}): {[p['term'] for p in to_queue]}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# Internal sync helpers
# ─────────────────────────────────────────────────────────────────────────────

_STOP_WORDS = {
    "the", "and", "for", "are", "was", "were", "this", "that", "with",
    "have", "has", "had", "but", "not", "from", "they", "will", "what",
    "can", "all", "been", "one", "two", "three", "four", "five", "six",
    "you", "out", "who", "get", "her", "him", "his", "its", "our", "my",
    "via", "per", "any", "now", "got", "she", "let", "did", "may", "yes",
    "sir", "say", "see", "too", "off", "how", "put", "set", "try", "back",
    "just", "like", "into", "than", "them", "then", "some", "also", "onto",
    "went", "over", "copy", "okay", "unit", "post", "road", "lane", "going",
    "being", "doing", "there", "their", "about", "would", "could", "should",
    "route", "north", "south", "east", "west", "avenue", "street", "drive",
}


def _diff_new_terms(original: str, corrected: str) -> list[str]:
    """
    Token-level diff: find tokens in `corrected` that replaced or were inserted
    relative to `original`. These are the admin's intended spellings — good
    candidates for vocabulary.
    """
    orig_tokens = original.split()
    corr_tokens = corrected.split()

    matcher = difflib.SequenceMatcher(
        None,
        [t.lower() for t in orig_tokens],
        [t.lower() for t in corr_tokens],
    )
    new_terms: list[str] = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        if tag in ("insert", "replace"):
            for tok in corr_tokens[j1:j2]:
                clean = tok.strip(".,!?;:()'\"").strip("-")
                if len(clean) >= 3 and clean.lower() not in _STOP_WORDS:
                    new_terms.append(clean)

    return list(dict.fromkeys(new_terms))
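
# Worked example (strings hypothetical):
#   _diff_new_terms("5 acre responding", "5-baker responding") -> ["5-baker"]
# "responding" matches on both sides, so only the replaced token is kept.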

def _sync_bootstrap(system_name: str, system_type: str, area_hint: str) -> list[str]:
    from app.config import settings as cfg
    from openai import OpenAI

    if not cfg.openai_api_key:
        return []

    prompt = _BOOTSTRAP_PROMPT.format(
        system_name=system_name,
        system_type=system_type,
        area_hint=area_hint,
    )
    try:
        client = OpenAI(api_key=cfg.openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        data = json.loads(response.choices[0].message.content)
        terms = data.get("vocabulary") or []
        return [str(t).strip() for t in terms if str(t).strip()]
    except Exception as e:
        logger.warning(f"Vocabulary bootstrap GPT call failed: {e}")
        return []


def _sync_induct(
    system_name: str, existing_vocab: list[str], transcript_block: str
) -> list[str]:
    from app.config import settings as cfg
    from openai import OpenAI

    if not cfg.openai_api_key:
        return []

    vocab_str = ", ".join(existing_vocab[:80]) if existing_vocab else "(none yet)"
    prompt = _INDUCTION_PROMPT.format(
        system_name=system_name,
        existing_vocab=vocab_str,
        transcript_block=transcript_block[:8000],
    )
    try:
        client = OpenAI(api_key=cfg.openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        data = json.loads(response.choices[0].message.content)
        terms = data.get("new_terms") or []
        return [str(t).strip() for t in terms if str(t).strip()]
    except Exception as e:
        logger.warning(f"Vocabulary induction GPT call failed: {e}")
        return []

@@ -6,6 +6,7 @@ from app.internal.logger import logger
from app.internal.mqtt_handler import mqtt_handler
from app.internal.node_sweeper import sweeper_loop
from app.internal.summarizer import summarizer_loop
from app.internal.vocabulary_learner import vocabulary_induction_loop
from app.config import settings
from app.internal.auth import require_firebase_token, require_service_or_firebase_token
from app.routers import nodes, systems, calls, upload, tokens, incidents, alerts

@@ -35,14 +36,16 @@ async def lifespan(app: FastAPI):
    await _release_orphaned_tokens()

    await mqtt_handler.connect()
    sweeper_task = asyncio.create_task(sweeper_loop())
    summarizer_task = asyncio.create_task(summarizer_loop())
    induction_task = asyncio.create_task(vocabulary_induction_loop())

    yield  # --- app running ---

    logger.info("DRB C2 Core shutting down.")
    sweeper_task.cancel()
    summarizer_task.cancel()
    induction_task.cancel()
    await mqtt_handler.disconnect()

@@ -83,6 +83,13 @@ async def patch_transcript(
        "embedding": None,
    })

    # Learn from the correction: diff original → corrected and add new tokens to vocabulary
    system_id = call.get("system_id")
    original_text = call.get("transcript_corrected") or call.get("transcript") or ""
    if system_id and original_text:
        from app.internal.vocabulary_learner import learn_from_correction
        await learn_from_correction(system_id, original_text, body.transcript)

    from app.routers.upload import _run_extraction_pipeline
    background_tasks.add_task(
        _run_extraction_pipeline,

@@ -1,11 +1,17 @@
import uuid
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import Optional
from app.models import SystemCreate, SystemRecord
from app.internal import firestore as fstore

router = APIRouter(prefix="/systems", tags=["systems"])


class VocabularyTermBody(BaseModel):
    term: str


@router.get("")
async def list_systems():
    return await fstore.collection_list("systems")

@@ -42,3 +48,70 @@ async def delete_system(system_id: str):
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    await fstore.doc_delete("systems", system_id)


# ── Vocabulary endpoints ───────────────────────────────────────────────────────

@router.get("/{system_id}/vocabulary")
async def get_vocabulary(system_id: str):
    """Return approved vocabulary and pending induction suggestions."""
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import get_vocabulary as _get
    return await _get(system_id)


@router.post("/{system_id}/vocabulary/bootstrap", status_code=202)
async def bootstrap_vocabulary(system_id: str):
    """Trigger a one-shot GPT-4o bootstrap to seed the vocabulary from local knowledge."""
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import bootstrap_system_vocabulary
    terms = await bootstrap_system_vocabulary(system_id)
    return {"added": len(terms), "terms": terms}


@router.post("/{system_id}/vocabulary/terms")
async def add_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Manually add a term to the approved vocabulary."""
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import add_term
    await add_term(system_id, body.term.strip())
    return {"ok": True}


@router.delete("/{system_id}/vocabulary/terms")
async def remove_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Remove a term from the approved vocabulary."""
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import remove_term
    await remove_term(system_id, body.term)
    return {"ok": True}


@router.post("/{system_id}/vocabulary/pending/approve")
async def approve_pending(system_id: str, body: VocabularyTermBody):
    """Move a pending induction suggestion into the approved vocabulary."""
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import approve_pending_term
    await approve_pending_term(system_id, body.term)
    return {"ok": True}


@router.post("/{system_id}/vocabulary/pending/dismiss")
async def dismiss_pending(system_id: str, body: VocabularyTermBody):
    """Dismiss a pending induction suggestion without adding it."""
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import dismiss_pending_term
    await dismiss_pending_term(system_id, body.term)
    return {"ok": True}

@@ -151,7 +151,9 @@ async def _run_intelligence_pipeline(

    # Step 1: Transcription
    if gcs_uri:
-        transcript, segments = await transcription.transcribe_call(call_id, gcs_uri, talkgroup_name)
+        transcript, segments = await transcription.transcribe_call(
+            call_id, gcs_uri, talkgroup_name, system_id=system_id
+        )

    # Step 2: Intelligence extraction
    tags: list[str] = []