Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 65839a3191 | |||
| 338b946ba3 |
@@ -27,6 +27,11 @@ class Settings(BaseSettings):
|
|||||||
embedding_similarity_threshold: float = 0.93 # slow-path cosine threshold (tiebreaker only)
|
embedding_similarity_threshold: float = 0.93 # slow-path cosine threshold (tiebreaker only)
|
||||||
location_proximity_km: float = 0.5 # radius for location-proximity matching
|
location_proximity_km: float = 0.5 # radius for location-proximity matching
|
||||||
incident_auto_resolve_minutes: int = 90 # auto-resolve after N minutes with no new calls
|
incident_auto_resolve_minutes: int = 90 # auto-resolve after N minutes with no new calls
|
||||||
|
recorrelation_scan_minutes: int = 15 # re-examine orphaned calls ended within this window
|
||||||
|
|
||||||
|
# Vocabulary learning
|
||||||
|
vocabulary_induction_interval_hours: int = 24 # how often the induction loop runs
|
||||||
|
vocabulary_induction_sample_tokens: int = 4000 # ~tokens of transcript text sampled per system
|
||||||
|
|
||||||
# Internal service key — allows server-side services (discord bot) to call C2 without Firebase
|
# Internal service key — allows server-side services (discord bot) to call C2 without Firebase
|
||||||
service_key: Optional[str] = None
|
service_key: Optional[str] = None
|
||||||
|
|||||||
@@ -57,6 +57,24 @@ async def collection_list(collection: str, **filters) -> list[dict]:
|
|||||||
return await asyncio.to_thread(_query)
|
return await asyncio.to_thread(_query)
|
||||||
|
|
||||||
|
|
||||||
|
async def collection_where(
|
||||||
|
collection: str,
|
||||||
|
conditions: list[tuple[str, str, Any]],
|
||||||
|
) -> list[dict]:
|
||||||
|
"""
|
||||||
|
Query a collection with arbitrary where-clauses.
|
||||||
|
conditions: list of (field, op, value) — e.g. [("ended_at", ">=", cutoff_dt)]
|
||||||
|
Supports any Firestore operator: "==", "!=", "<", "<=", ">", ">=".
|
||||||
|
"""
|
||||||
|
def _query():
|
||||||
|
ref = db.collection(collection)
|
||||||
|
for field, op, value in conditions:
|
||||||
|
ref = ref.where(field, op, value)
|
||||||
|
return [doc.to_dict() for doc in ref.stream()]
|
||||||
|
|
||||||
|
return await asyncio.to_thread(_query)
|
||||||
|
|
||||||
|
|
||||||
async def doc_delete(collection: str, doc_id: str) -> None:
|
async def doc_delete(collection: str, doc_id: str) -> None:
|
||||||
ref = db.collection(collection).document(doc_id)
|
ref = db.collection(collection).document(doc_id)
|
||||||
await asyncio.to_thread(ref.delete)
|
await asyncio.to_thread(ref.delete)
|
||||||
|
|||||||
@@ -46,17 +46,27 @@ async def correlate_call(
|
|||||||
incident_type: Optional[str],
|
incident_type: Optional[str],
|
||||||
location: Optional[str] = None,
|
location: Optional[str] = None,
|
||||||
location_coords: Optional[dict] = None,
|
location_coords: Optional[dict] = None,
|
||||||
|
reference_time: Optional[datetime] = None,
|
||||||
|
create_if_new: bool = True,
|
||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Link call_id to an existing incident or create a new one.
|
Link call_id to an existing incident or create a new one.
|
||||||
Returns the incident_id, or None if skipped (no type and no talkgroup match).
|
|
||||||
|
reference_time — time anchor for the time-limited paths (location + slow).
|
||||||
|
Defaults to now. Pass call.started_at when re-correlating
|
||||||
|
orphaned calls so the window is anchored to when the call
|
||||||
|
actually happened, not when the sweep runs.
|
||||||
|
create_if_new — when False, skip new-incident creation (re-correlation only
|
||||||
|
links to existing incidents; it never creates new ones).
|
||||||
|
|
||||||
|
Returns the incident_id, or None if skipped.
|
||||||
"""
|
"""
|
||||||
now = datetime.now(timezone.utc)
|
now = reference_time or datetime.now(timezone.utc)
|
||||||
window = timedelta(hours=settings.correlation_window_hours)
|
window = timedelta(hours=settings.correlation_window_hours)
|
||||||
|
|
||||||
# Fetch all active incidents cross-type (mutual aid needs this)
|
# Fetch all active incidents cross-type (mutual aid needs this)
|
||||||
all_active = await fstore.collection_list("incidents", status="active")
|
all_active = await fstore.collection_list("incidents", status="active")
|
||||||
recent = [inc for inc in all_active if _within_window(inc, now, window)]
|
recent = [inc for inc in all_active if _within_window_of(inc, now, window)]
|
||||||
|
|
||||||
# Fetch call doc once — reused for disambiguation, embedding merge, unit accumulation
|
# Fetch call doc once — reused for disambiguation, embedding merge, unit accumulation
|
||||||
call_doc = await fstore.doc_get("calls", call_id) or {}
|
call_doc = await fstore.doc_get("calls", call_id) or {}
|
||||||
@@ -147,14 +157,14 @@ async def correlate_call(
|
|||||||
location, location_coords, call_units, call_vehicles, call_embedding, now,
|
location, location_coords, call_units, call_vehicles, call_embedding, now,
|
||||||
talkgroup_name=talkgroup_name, incident_type=incident_type,
|
talkgroup_name=talkgroup_name, incident_type=incident_type,
|
||||||
)
|
)
|
||||||
elif incident_type:
|
elif incident_type and create_if_new:
|
||||||
incident_id = await _create_incident(
|
incident_id = await _create_incident(
|
||||||
call_id, incident_type, talkgroup_id, talkgroup_name, system_id,
|
call_id, incident_type, talkgroup_id, talkgroup_name, system_id,
|
||||||
tags, location, location_coords,
|
tags, location, location_coords,
|
||||||
call_units, call_vehicles, call_embedding, call_severity, now,
|
call_units, call_vehicles, call_embedding, call_severity, now,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Unclassified call, no talkgroup match found — nothing to do
|
# No match and either no type or creation suppressed — nothing to do
|
||||||
return None
|
return None
|
||||||
|
|
||||||
await fstore.doc_set("calls", call_id, {"incident_id": incident_id})
|
await fstore.doc_set("calls", call_id, {"incident_id": incident_id})
|
||||||
@@ -165,14 +175,19 @@ async def correlate_call(
|
|||||||
# Internal helpers
|
# Internal helpers
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _within_window(inc: dict, now: datetime, window: timedelta) -> bool:
|
def _within_window_of(inc: dict, anchor: datetime, window: timedelta) -> bool:
|
||||||
|
"""
|
||||||
|
True if the incident's started_at is within `window` of `anchor` in either
|
||||||
|
direction. Using started_at (not updated_at) means re-correlation anchored
|
||||||
|
to a call's started_at correctly matches incidents created shortly *after*
|
||||||
|
that call (e.g. a welfare check at T+0 vs. an incident created at T+15m).
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
dt = datetime.fromisoformat(
|
raw = inc.get("started_at") or inc.get("updated_at") or ""
|
||||||
str(inc.get("updated_at", "")).replace("Z", "+00:00")
|
dt = datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
|
||||||
)
|
|
||||||
if dt.tzinfo is None:
|
if dt.tzinfo is None:
|
||||||
dt = dt.replace(tzinfo=timezone.utc)
|
dt = dt.replace(tzinfo=timezone.utc)
|
||||||
return (now - dt) <= window
|
return abs((anchor - dt).total_seconds()) <= window.total_seconds()
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ Rules:
|
|||||||
|
|
||||||
System: {system_id}
|
System: {system_id}
|
||||||
Talkgroup: {talkgroup_name}
|
Talkgroup: {talkgroup_name}
|
||||||
{transcript_block}"""
|
{vocabulary_block}{transcript_block}"""
|
||||||
|
|
||||||
# Nominatim viewbox half-width in degrees (~11 km at mid-latitudes)
|
# Nominatim viewbox half-width in degrees (~11 km at mid-latitudes)
|
||||||
_GEO_DELTA = 0.1
|
_GEO_DELTA = 0.1
|
||||||
@@ -76,8 +76,15 @@ async def extract_tags(
|
|||||||
Side-effect: updates calls/{call_id} in Firestore with tags, location,
|
Side-effect: updates calls/{call_id} in Firestore with tags, location,
|
||||||
location_coords, vehicles, units, severity, transcript_corrected; also stores embedding.
|
location_coords, vehicles, units, severity, transcript_corrected; also stores embedding.
|
||||||
"""
|
"""
|
||||||
|
# Load per-system vocabulary for prompt injection
|
||||||
|
vocabulary: list[str] = []
|
||||||
|
if system_id:
|
||||||
|
from app.internal.vocabulary_learner import get_vocabulary
|
||||||
|
vocab_data = await get_vocabulary(system_id)
|
||||||
|
vocabulary = vocab_data.get("vocabulary") or []
|
||||||
|
|
||||||
result = await asyncio.to_thread(
|
result = await asyncio.to_thread(
|
||||||
_sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments
|
_sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary
|
||||||
)
|
)
|
||||||
|
|
||||||
tags: list[str] = result.get("tags") or []
|
tags: list[str] = result.get("tags") or []
|
||||||
@@ -233,6 +240,7 @@ def _sync_extract(
|
|||||||
talkgroup_id: Optional[int],
|
talkgroup_id: Optional[int],
|
||||||
system_id: Optional[str],
|
system_id: Optional[str],
|
||||||
segments: Optional[list[dict]],
|
segments: Optional[list[dict]],
|
||||||
|
vocabulary: Optional[list[str]] = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Call GPT-4o mini and parse the JSON response."""
|
"""Call GPT-4o mini and parse the JSON response."""
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
@@ -242,11 +250,13 @@ def _sync_extract(
|
|||||||
logger.warning("OPENAI_API_KEY not set — intelligence extraction disabled.")
|
logger.warning("OPENAI_API_KEY not set — intelligence extraction disabled.")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
from app.internal.vocabulary_learner import build_gpt_vocab_block
|
||||||
tg = f"{talkgroup_name} (TGID {talkgroup_id})" if talkgroup_id else (talkgroup_name or "unknown")
|
tg = f"{talkgroup_name} (TGID {talkgroup_id})" if talkgroup_id else (talkgroup_name or "unknown")
|
||||||
prompt = _PROMPT_TEMPLATE.format(
|
prompt = _PROMPT_TEMPLATE.format(
|
||||||
transcript_block=_build_transcript_block(transcript, segments),
|
transcript_block=_build_transcript_block(transcript, segments),
|
||||||
talkgroup_name=tg,
|
talkgroup_name=tg,
|
||||||
system_id=system_id or "unknown",
|
system_id=system_id or "unknown",
|
||||||
|
vocabulary_block=build_gpt_vocab_block(vocabulary or []),
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -0,0 +1,108 @@
|
|||||||
|
"""
|
||||||
|
Re-correlation sweep.
|
||||||
|
|
||||||
|
Runs every summary_interval_minutes (same tick as the summarizer). Each pass
|
||||||
|
finds calls that are:
|
||||||
|
- recently ended (ended_at within the last recorrelation_scan_minutes)
|
||||||
|
- still orphaned (incident_id is null)
|
||||||
|
|
||||||
|
and re-runs the incident correlator against currently-active incidents, using
|
||||||
|
the call's own started_at as the time anchor so the window is correct regardless
|
||||||
|
of when the sweep fires.
|
||||||
|
|
||||||
|
Never creates new incidents — link-only. Zero LLM tokens (uses pre-computed
|
||||||
|
talkgroup strings, haversine math, and stored embeddings).
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from typing import Optional
|
||||||
|
from app.internal.logger import logger
|
||||||
|
from app.internal import firestore as fstore
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
async def recorrelation_loop() -> None:
|
||||||
|
interval = settings.summary_interval_minutes * 60
|
||||||
|
logger.info(
|
||||||
|
f"Re-correlation sweep started — "
|
||||||
|
f"interval: {settings.summary_interval_minutes}m, "
|
||||||
|
f"scan window: {settings.recorrelation_scan_minutes}m"
|
||||||
|
)
|
||||||
|
while True:
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
try:
|
||||||
|
await _run_sweep_pass()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Re-correlation sweep failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_sweep_pass() -> None:
|
||||||
|
cutoff = datetime.now(timezone.utc) - timedelta(minutes=settings.recorrelation_scan_minutes)
|
||||||
|
|
||||||
|
# Server-side range query: only calls that ended within the scan window.
|
||||||
|
# Filter incident_id=null client-side (Firestore can't query for missing fields).
|
||||||
|
# This keeps the fetched set small regardless of total collection size.
|
||||||
|
recent_ended = await fstore.collection_where("calls", [
|
||||||
|
("status", "==", "ended"),
|
||||||
|
("ended_at", ">=", cutoff),
|
||||||
|
])
|
||||||
|
orphans = [c for c in recent_ended if not c.get("incident_id")]
|
||||||
|
|
||||||
|
if not orphans:
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info(f"Re-correlation sweep: {len(orphans)} orphaned call(s) to check")
|
||||||
|
linked = 0
|
||||||
|
for call in orphans:
|
||||||
|
if await _recorrelate_orphan(call):
|
||||||
|
linked += 1
|
||||||
|
|
||||||
|
if linked:
|
||||||
|
logger.info(f"Re-correlation sweep: linked {linked}/{len(orphans)} orphaned call(s)")
|
||||||
|
|
||||||
|
|
||||||
|
async def _recorrelate_orphan(call: dict) -> bool:
|
||||||
|
"""
|
||||||
|
Attempt to link a single orphaned call to an existing incident.
|
||||||
|
Returns True if a match was found and the call was linked.
|
||||||
|
"""
|
||||||
|
from app.internal import incident_correlator
|
||||||
|
|
||||||
|
call_id = call.get("call_id")
|
||||||
|
started_at = _parse_dt(call.get("started_at"))
|
||||||
|
if not call_id or not started_at:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# All data needed for correlation was stored by the first-pass extraction.
|
||||||
|
incident_id = await incident_correlator.correlate_call(
|
||||||
|
call_id = call_id,
|
||||||
|
node_id = call.get("node_id", ""),
|
||||||
|
system_id = call.get("system_id"),
|
||||||
|
talkgroup_id = call.get("talkgroup_id"),
|
||||||
|
talkgroup_name = call.get("talkgroup_name"),
|
||||||
|
tags = call.get("tags") or [],
|
||||||
|
incident_type = call.get("incident_type"),
|
||||||
|
location = call.get("location"),
|
||||||
|
location_coords= call.get("location_coords"),
|
||||||
|
reference_time = started_at, # anchor window to when the call happened
|
||||||
|
create_if_new = False, # never create — link-only
|
||||||
|
)
|
||||||
|
|
||||||
|
if incident_id:
|
||||||
|
logger.info(
|
||||||
|
f"Re-correlation: linked orphaned call {call_id} → incident {incident_id}"
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_dt(value) -> Optional[datetime]:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
|
||||||
|
if dt.tzinfo is None:
|
||||||
|
dt = dt.replace(tzinfo=timezone.utc)
|
||||||
|
return dt
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
@@ -28,6 +28,7 @@ async def transcribe_call(
|
|||||||
call_id: str,
|
call_id: str,
|
||||||
gcs_uri: str,
|
gcs_uri: str,
|
||||||
talkgroup_name: Optional[str] = None,
|
talkgroup_name: Optional[str] = None,
|
||||||
|
system_id: Optional[str] = None,
|
||||||
) -> tuple[Optional[str], list[dict]]:
|
) -> tuple[Optional[str], list[dict]]:
|
||||||
"""
|
"""
|
||||||
Transcribe audio at the given GCS URI and store the result in Firestore.
|
Transcribe audio at the given GCS URI and store the result in Firestore.
|
||||||
@@ -39,8 +40,17 @@ async def transcribe_call(
|
|||||||
if not gcs_uri or not gcs_uri.startswith("gs://"):
|
if not gcs_uri or not gcs_uri.startswith("gs://"):
|
||||||
return None, []
|
return None, []
|
||||||
|
|
||||||
|
# Load vocabulary for this system (empty list if none yet)
|
||||||
|
vocabulary: list[str] = []
|
||||||
|
if system_id:
|
||||||
|
from app.internal.vocabulary_learner import get_vocabulary
|
||||||
|
vocab_data = await get_vocabulary(system_id)
|
||||||
|
vocabulary = vocab_data.get("vocabulary") or []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
transcript, segments = await asyncio.to_thread(_sync_transcribe, gcs_uri, talkgroup_name)
|
transcript, segments = await asyncio.to_thread(
|
||||||
|
_sync_transcribe, gcs_uri, talkgroup_name, vocabulary
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Transcription failed for call {call_id}: {e}")
|
logger.warning(f"Transcription failed for call {call_id}: {e}")
|
||||||
return None, []
|
return None, []
|
||||||
@@ -61,7 +71,11 @@ async def transcribe_call(
|
|||||||
return transcript, segments
|
return transcript, segments
|
||||||
|
|
||||||
|
|
||||||
def _sync_transcribe(gcs_uri: str, talkgroup_name: Optional[str] = None) -> tuple[Optional[str], list[dict]]:
|
def _sync_transcribe(
|
||||||
|
gcs_uri: str,
|
||||||
|
talkgroup_name: Optional[str] = None,
|
||||||
|
vocabulary: Optional[list[str]] = None,
|
||||||
|
) -> tuple[Optional[str], list[dict]]:
|
||||||
"""Download audio from GCS and transcribe with OpenAI Whisper."""
|
"""Download audio from GCS and transcribe with OpenAI Whisper."""
|
||||||
from google.cloud import storage as gcs
|
from google.cloud import storage as gcs
|
||||||
from google.oauth2 import service_account
|
from google.oauth2 import service_account
|
||||||
@@ -94,7 +108,10 @@ def _sync_transcribe(gcs_uri: str, talkgroup_name: Optional[str] = None) -> tupl
|
|||||||
try:
|
try:
|
||||||
blob.download_to_filename(tmp_path)
|
blob.download_to_filename(tmp_path)
|
||||||
|
|
||||||
prompt = (f"Talkgroup: {talkgroup_name}. " + _WHISPER_PROMPT) if talkgroup_name else _WHISPER_PROMPT
|
from app.internal.vocabulary_learner import build_whisper_vocab_prompt
|
||||||
|
vocab_prefix = build_whisper_vocab_prompt(vocabulary or [])
|
||||||
|
tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
|
||||||
|
prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT
|
||||||
|
|
||||||
openai_client = OpenAI(api_key=settings.openai_api_key)
|
openai_client = OpenAI(api_key=settings.openai_api_key)
|
||||||
with open(tmp_path, "rb") as f:
|
with open(tmp_path, "rb") as f:
|
||||||
|
|||||||
@@ -0,0 +1,426 @@
|
|||||||
|
"""
|
||||||
|
Per-system vocabulary learning for STT accuracy improvement.
|
||||||
|
|
||||||
|
Three mechanisms:
|
||||||
|
1. Bootstrap — one-shot GPT-4o call generates local knowledge at system setup:
|
||||||
|
agencies + abbreviations, unit naming, streets, acronyms.
|
||||||
|
2. Correction — diffs admin transcript edits, extracts corrected tokens → vocabulary.
|
||||||
|
3. Induction — background loop samples N tokens of transcripts per system,
|
||||||
|
asks GPT-4o-mini to propose new terms → queued as pending for review.
|
||||||
|
|
||||||
|
Firestore schema additions on system documents:
|
||||||
|
vocabulary: list[str] — approved terms; injected into Whisper + GPT prompts
|
||||||
|
vocabulary_pending: list[dict] — induction suggestions awaiting admin review
|
||||||
|
each: {term, source, added_at}
|
||||||
|
vocabulary_bootstrapped: bool — bootstrap has been run at least once
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import difflib
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
from app.internal.logger import logger
|
||||||
|
from app.internal import firestore as fstore
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Prompt templates
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_BOOTSTRAP_PROMPT = """\
|
||||||
|
You are building a radio vocabulary dictionary to improve speech-to-text accuracy for a P25 \
|
||||||
|
public-safety radio monitoring system in a specific area. The STT model has no local knowledge, \
|
||||||
|
so common terms like "YVAC" get transcribed as "why vac", "5-baker" as "5 acre", etc.
|
||||||
|
|
||||||
|
System name: {system_name}
|
||||||
|
System type: {system_type}
|
||||||
|
Area context: {area_hint}
|
||||||
|
|
||||||
|
Return ONLY a JSON object:
|
||||||
|
{{"vocabulary": [list of strings]}}
|
||||||
|
|
||||||
|
Include terms you are confident about for this area:
|
||||||
|
- Agency names and their radio abbreviations (e.g. "YVAC" = Yorktown Volunteer Ambulance Corps)
|
||||||
|
- Unit ID examples using the local naming convention (e.g. "5-baker", "5-charlie", "1-david";
|
||||||
|
many departments use APCO phonetics: adam, baker, charles, david, edward, frank, george,
|
||||||
|
henry, ida, john, king, lincoln, mary, nora, ocean, paul, queen, robert, sam, tom, union,
|
||||||
|
victor, william, x-ray, young, zebra)
|
||||||
|
- Major routes, roads, and key intersections
|
||||||
|
- Local landmarks and geographic references dispatchers use
|
||||||
|
- Agency-specific codes that differ from standard APCO
|
||||||
|
|
||||||
|
Return a flat list of strings — abbreviations, proper names, unit IDs, street names.
|
||||||
|
Do NOT include common English words. Max 80 terms. Only include what you are confident is \
|
||||||
|
accurate for this specific area; return fewer terms rather than guessing."""
|
||||||
|
|
||||||
|
_INDUCTION_PROMPT = """\
|
||||||
|
You are analyzing P25 emergency radio transcripts to find vocabulary terms that should be \
|
||||||
|
added to improve future speech-to-text accuracy for this system.
|
||||||
|
|
||||||
|
System: {system_name}
|
||||||
|
Existing approved vocabulary (do not re-propose these): {existing_vocab}
|
||||||
|
|
||||||
|
Sampled transcripts:
|
||||||
|
{transcript_block}
|
||||||
|
|
||||||
|
Find terms that are LIKELY STT errors or local terms missing from the vocabulary:
|
||||||
|
- Unit IDs that appear garbled (e.g. "5 acre" → "5-baker")
|
||||||
|
- Agency acronyms spelled out phonetically (e.g. "why vac" → "YVAC")
|
||||||
|
- Street names or locations that look misspelled or oddly transcribed
|
||||||
|
- Callsigns or local codes not yet in the vocabulary
|
||||||
|
|
||||||
|
Return ONLY a JSON object:
|
||||||
|
{{"new_terms": ["term1", "term2", ...]}}
|
||||||
|
|
||||||
|
Only include high-confidence additions not already in existing vocabulary.
|
||||||
|
Return {{"new_terms": []}} if nothing new is found."""
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Public API
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def bootstrap_system_vocabulary(system_id: str) -> list[str]:
|
||||||
|
"""
|
||||||
|
One-shot GPT-4o bootstrap: generate local-knowledge vocabulary for a system.
|
||||||
|
Merges generated terms into system.vocabulary and sets vocabulary_bootstrapped=True.
|
||||||
|
Returns the list of newly generated terms.
|
||||||
|
"""
|
||||||
|
system_doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not system_doc:
|
||||||
|
logger.warning(f"Vocabulary bootstrap: system {system_id} not found")
|
||||||
|
return []
|
||||||
|
|
||||||
|
system_name = system_doc.get("name", "Unknown")
|
||||||
|
system_type = system_doc.get("type", "P25")
|
||||||
|
|
||||||
|
# Build area hint from configured talkgroup names
|
||||||
|
talkgroups = system_doc.get("config", {}).get("talkgroups", [])
|
||||||
|
tg_names = [tg.get("name", "") for tg in talkgroups if tg.get("name")][:8]
|
||||||
|
area_hint = f"Talkgroups include: {', '.join(tg_names)}" if tg_names else "Unknown area"
|
||||||
|
|
||||||
|
terms = await asyncio.to_thread(_sync_bootstrap, system_name, system_type, area_hint)
|
||||||
|
if not terms:
|
||||||
|
return []
|
||||||
|
|
||||||
|
existing = system_doc.get("vocabulary") or []
|
||||||
|
existing_lower = {t.lower() for t in existing}
|
||||||
|
to_add = [t for t in terms if t.lower() not in existing_lower]
|
||||||
|
merged = list(dict.fromkeys(existing + to_add))
|
||||||
|
|
||||||
|
await fstore.doc_set("systems", system_id, {
|
||||||
|
"vocabulary": merged,
|
||||||
|
"vocabulary_bootstrapped": True,
|
||||||
|
})
|
||||||
|
logger.info(
|
||||||
|
f"Vocabulary bootstrap: {len(to_add)} term(s) generated for system {system_id} "
|
||||||
|
f"({system_name})"
|
||||||
|
)
|
||||||
|
return to_add
|
||||||
|
|
||||||
|
|
||||||
|
async def learn_from_correction(system_id: str, original: str, corrected: str) -> None:
|
||||||
|
"""
|
||||||
|
Diff original and corrected transcripts; append new tokens to the approved vocabulary.
|
||||||
|
Called automatically when an admin saves a transcript correction.
|
||||||
|
"""
|
||||||
|
if not system_id or not original or not corrected:
|
||||||
|
return
|
||||||
|
|
||||||
|
new_terms = _diff_new_terms(original, corrected)
|
||||||
|
if not new_terms:
|
||||||
|
return
|
||||||
|
|
||||||
|
system_doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not system_doc:
|
||||||
|
return
|
||||||
|
|
||||||
|
existing = system_doc.get("vocabulary") or []
|
||||||
|
existing_lower = {t.lower() for t in existing}
|
||||||
|
to_add = [t for t in new_terms if t.lower() not in existing_lower]
|
||||||
|
if not to_add:
|
||||||
|
return
|
||||||
|
|
||||||
|
merged = list(dict.fromkeys(existing + to_add))
|
||||||
|
await fstore.doc_set("systems", system_id, {"vocabulary": merged})
|
||||||
|
logger.info(
|
||||||
|
f"Vocabulary: learned {len(to_add)} term(s) from correction on system {system_id}: "
|
||||||
|
f"{to_add}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def approve_pending_term(system_id: str, term: str) -> None:
|
||||||
|
"""Move a pending term into the approved vocabulary."""
|
||||||
|
system_doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not system_doc:
|
||||||
|
return
|
||||||
|
pending = [p for p in (system_doc.get("vocabulary_pending") or []) if p["term"] != term]
|
||||||
|
vocab = system_doc.get("vocabulary") or []
|
||||||
|
if term.lower() not in {t.lower() for t in vocab}:
|
||||||
|
vocab = list(dict.fromkeys(vocab + [term]))
|
||||||
|
await fstore.doc_set("systems", system_id, {
|
||||||
|
"vocabulary": vocab,
|
||||||
|
"vocabulary_pending": pending,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
async def dismiss_pending_term(system_id: str, term: str) -> None:
|
||||||
|
"""Remove a pending term without adding it to vocabulary."""
|
||||||
|
system_doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not system_doc:
|
||||||
|
return
|
||||||
|
pending = [p for p in (system_doc.get("vocabulary_pending") or []) if p["term"] != term]
|
||||||
|
await fstore.doc_set("systems", system_id, {"vocabulary_pending": pending})
|
||||||
|
|
||||||
|
|
||||||
|
async def add_term(system_id: str, term: str) -> None:
|
||||||
|
"""Manually add a term to the approved vocabulary."""
|
||||||
|
system_doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not system_doc:
|
||||||
|
return
|
||||||
|
vocab = system_doc.get("vocabulary") or []
|
||||||
|
if term.lower() not in {t.lower() for t in vocab}:
|
||||||
|
vocab = list(dict.fromkeys(vocab + [term.strip()]))
|
||||||
|
await fstore.doc_set("systems", system_id, {"vocabulary": vocab})
|
||||||
|
|
||||||
|
|
||||||
|
async def remove_term(system_id: str, term: str) -> None:
|
||||||
|
"""Remove a term from the approved vocabulary."""
|
||||||
|
system_doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not system_doc:
|
||||||
|
return
|
||||||
|
vocab = [t for t in (system_doc.get("vocabulary") or []) if t.lower() != term.lower()]
|
||||||
|
await fstore.doc_set("systems", system_id, {"vocabulary": vocab})
|
||||||
|
|
||||||
|
|
||||||
|
async def get_vocabulary(system_id: str) -> dict:
|
||||||
|
"""Return vocabulary and pending terms for a system."""
|
||||||
|
doc = await fstore.doc_get("systems", system_id)
|
||||||
|
if not doc:
|
||||||
|
return {"vocabulary": [], "vocabulary_pending": [], "vocabulary_bootstrapped": False}
|
||||||
|
return {
|
||||||
|
"vocabulary": doc.get("vocabulary") or [],
|
||||||
|
"vocabulary_pending": doc.get("vocabulary_pending") or [],
|
||||||
|
"vocabulary_bootstrapped": doc.get("vocabulary_bootstrapped", False),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Prompt-injection helpers (called by transcription.py and intelligence.py)
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def build_whisper_vocab_prompt(vocabulary: list[str]) -> str:
|
||||||
|
"""
|
||||||
|
Format vocabulary for Whisper prompt injection.
|
||||||
|
Whisper's prompt field acts as a context prior with a ~224-token limit.
|
||||||
|
The base _WHISPER_PROMPT uses ~70 tokens; we budget ~150 tokens (≈550 chars) here.
|
||||||
|
"""
|
||||||
|
if not vocabulary:
|
||||||
|
return ""
|
||||||
|
char_budget = 550
|
||||||
|
terms: list[str] = []
|
||||||
|
used = 0
|
||||||
|
for term in vocabulary:
|
||||||
|
cost = len(term) + 2 # ", "
|
||||||
|
if used + cost > char_budget:
|
||||||
|
break
|
||||||
|
terms.append(term)
|
||||||
|
used += cost
|
||||||
|
return ", ".join(terms) + ". " if terms else ""
|
||||||
|
|
||||||
|
|
||||||
|
def build_gpt_vocab_block(vocabulary: list[str]) -> str:
|
||||||
|
"""Format vocabulary for injection into GPT extraction prompts."""
|
||||||
|
if not vocabulary:
|
||||||
|
return ""
|
||||||
|
return f"Known local terms: {', '.join(vocabulary)}\n"
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Background induction loop
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def vocabulary_induction_loop() -> None:
|
||||||
|
interval = settings.vocabulary_induction_interval_hours * 3600
|
||||||
|
logger.info(
|
||||||
|
f"Vocabulary induction loop started — "
|
||||||
|
f"interval: {settings.vocabulary_induction_interval_hours}h, "
|
||||||
|
f"sample budget: {settings.vocabulary_induction_sample_tokens} tokens"
|
||||||
|
)
|
||||||
|
while True:
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
try:
|
||||||
|
await _run_induction_pass()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Vocabulary induction pass failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_induction_pass() -> None:
|
||||||
|
systems = await fstore.collection_list("systems")
|
||||||
|
if not systems:
|
||||||
|
return
|
||||||
|
logger.info(f"Vocabulary induction: processing {len(systems)} system(s)")
|
||||||
|
for system in systems:
|
||||||
|
system_id = system.get("system_id")
|
||||||
|
if system_id:
|
||||||
|
try:
|
||||||
|
await _induct_system(system_id, system)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Induction failed for system {system_id}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def _induct_system(system_id: str, system_doc: dict) -> None:
    """Sample random transcripts for a system and propose new vocabulary.

    Workflow: fetch ended calls, random-sample transcripts up to the
    configured token budget, ask GPT (via _sync_induct on a worker thread)
    for candidate terms, then queue genuinely-new ones on the system doc's
    `vocabulary_pending` list for admin review. Writes nothing when there
    is too little data or nothing new to propose.
    """
    system_name = system_doc.get("name", "Unknown")
    existing_vocab: list[str] = system_doc.get("vocabulary") or []

    # Fetch recent ended calls for this system
    all_calls = await fstore.collection_list("calls", system_id=system_id, status="ended")
    if not all_calls:
        return

    # Random sample up to the token budget (4 chars ≈ 1 token)
    random.shuffle(all_calls)
    char_budget = settings.vocabulary_induction_sample_tokens * 4
    transcript_block = ""
    sampled = 0
    for call in all_calls:
        # Prefer the admin-corrected transcript over the raw one when present.
        text = call.get("transcript_corrected") or call.get("transcript") or ""
        if not text:
            continue
        if len(transcript_block) + len(text) > char_budget:
            break
        tg = call.get("talkgroup_name") or f"TGID {call.get('talkgroup_id', '?')}"
        transcript_block += f"[{tg}] {text}\n"
        sampled += 1

    if sampled < 3:
        return  # not enough data to learn from yet

    # The OpenAI call in _sync_induct is blocking — run it off the event loop.
    new_terms = await asyncio.to_thread(
        _sync_induct, system_name, existing_vocab, transcript_block
    )
    if not new_terms:
        return

    now = datetime.now(timezone.utc).isoformat()
    existing_pending: list[dict] = system_doc.get("vocabulary_pending") or []
    # Case-insensitive dedup against both approved and already-pending terms.
    pending_lower = {p["term"].lower() for p in existing_pending}
    vocab_lower = {t.lower() for t in existing_vocab}

    to_queue = [
        {"term": t, "source": "induction", "added_at": now}
        for t in new_terms
        if t.lower() not in vocab_lower and t.lower() not in pending_lower
    ]
    if not to_queue:
        return

    await fstore.doc_set("systems", system_id, {
        "vocabulary_pending": existing_pending + to_queue,
    })
    logger.info(
        f"Vocabulary induction: {len(to_queue)} new term(s) proposed for "
        f"system {system_id} ({system_name}): {[p['term'] for p in to_queue]}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Internal sync helpers
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Common English filler plus radio/geography boilerplate ("copy", "unit",
# street-type words, compass directions) that should never be proposed as
# vocabulary candidates, however often a correction touches them.
# All entries are lowercase; _diff_new_terms compares with token.lower().
_STOP_WORDS = {
    "the", "and", "for", "are", "was", "were", "this", "that", "with",
    "have", "has", "had", "but", "not", "from", "they", "will", "what",
    "can", "all", "been", "one", "two", "three", "four", "five", "six",
    "you", "out", "who", "get", "her", "him", "his", "its", "our", "my",
    "via", "per", "any", "now", "got", "she", "let", "did", "may", "yes",
    "sir", "say", "see", "too", "off", "how", "put", "set", "try", "back",
    "just", "like", "into", "than", "them", "then", "some", "also", "onto",
    "went", "over", "copy", "okay", "unit", "post", "road", "lane", "going",
    "being", "doing", "there", "their", "about", "would", "could", "should",
    "route", "north", "south", "east", "west", "avenue", "street", "drive",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _diff_new_terms(original: str, corrected: str) -> list[str]:
    """
    Token-level diff between the original and the admin-corrected transcript.

    Tokens that were inserted or replaced in `corrected` are the admin's
    intended spellings — strong vocabulary candidates. Punctuation is
    stripped, short tokens and stop words are dropped, and the result is
    de-duplicated (case-sensitively) in first-seen order.
    """
    lhs = original.split()
    rhs = corrected.split()

    matcher = difflib.SequenceMatcher(
        None,
        [t.lower() for t in lhs],
        [t.lower() for t in rhs],
    )

    seen: set[str] = set()
    candidates: list[str] = []
    for op, _a1, _a2, b1, b2 in matcher.get_opcodes():
        if op not in ("insert", "replace"):
            continue
        for raw in rhs[b1:b2]:
            stripped = raw.strip(".,!?;:()'\"").strip("-")
            if len(stripped) < 3 or stripped.lower() in _STOP_WORDS:
                continue
            if stripped not in seen:
                seen.add(stripped)
                candidates.append(stripped)
    return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_bootstrap(system_name: str, system_type: str, area_hint: str) -> list[str]:
    """Blocking GPT-4o call that seeds a system's vocabulary from local knowledge.

    Returns a cleaned list of terms, or [] when no API key is configured or
    the request/parsing fails (failures are logged, never raised). Runs
    synchronously — callers dispatch it via a worker thread.
    """
    from app.config import settings as cfg
    from openai import OpenAI

    if not cfg.openai_api_key:
        return []

    prompt = _BOOTSTRAP_PROMPT.format(
        system_name=system_name,
        system_type=system_type,
        area_hint=area_hint,
    )
    try:
        client = OpenAI(api_key=cfg.openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        payload = json.loads(response.choices[0].message.content)
        raw_terms = payload.get("vocabulary") or []
        cleaned = (str(t).strip() for t in raw_terms)
        return [t for t in cleaned if t]
    except Exception as e:
        logger.warning(f"Vocabulary bootstrap GPT call failed: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_induct(
    system_name: str, existing_vocab: list[str], transcript_block: str
) -> list[str]:
    """Blocking GPT-4o-mini call proposing new vocabulary from sampled transcripts.

    The prompt is capped at the first 80 existing terms and 8000 transcript
    characters. Returns cleaned candidate terms, or [] on a missing API key
    or any failure (logged, never raised).
    """
    from app.config import settings as cfg
    from openai import OpenAI

    if not cfg.openai_api_key:
        return []

    vocab_str = ", ".join(existing_vocab[:80]) if existing_vocab else "(none yet)"
    prompt = _INDUCTION_PROMPT.format(
        system_name=system_name,
        existing_vocab=vocab_str,
        transcript_block=transcript_block[:8000],
    )
    try:
        client = OpenAI(api_key=cfg.openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        payload = json.loads(response.choices[0].message.content)
        raw_terms = payload.get("new_terms") or []
        cleaned = (str(t).strip() for t in raw_terms)
        return [t for t in cleaned if t]
    except Exception as e:
        logger.warning(f"Vocabulary induction GPT call failed: {e}")
        return []
|
||||||
@@ -6,6 +6,8 @@ from app.internal.logger import logger
|
|||||||
from app.internal.mqtt_handler import mqtt_handler
|
from app.internal.mqtt_handler import mqtt_handler
|
||||||
from app.internal.node_sweeper import sweeper_loop
|
from app.internal.node_sweeper import sweeper_loop
|
||||||
from app.internal.summarizer import summarizer_loop
|
from app.internal.summarizer import summarizer_loop
|
||||||
|
from app.internal.vocabulary_learner import vocabulary_induction_loop
|
||||||
|
from app.internal.recorrelation_sweep import recorrelation_loop
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.internal.auth import require_firebase_token, require_service_or_firebase_token
|
from app.internal.auth import require_firebase_token, require_service_or_firebase_token
|
||||||
from app.routers import nodes, systems, calls, upload, tokens, incidents, alerts
|
from app.routers import nodes, systems, calls, upload, tokens, incidents, alerts
|
||||||
@@ -35,14 +37,18 @@ async def lifespan(app: FastAPI):
|
|||||||
await _release_orphaned_tokens()
|
await _release_orphaned_tokens()
|
||||||
|
|
||||||
await mqtt_handler.connect()
|
await mqtt_handler.connect()
|
||||||
sweeper_task = asyncio.create_task(sweeper_loop())
|
sweeper_task = asyncio.create_task(sweeper_loop())
|
||||||
summarizer_task = asyncio.create_task(summarizer_loop())
|
summarizer_task = asyncio.create_task(summarizer_loop())
|
||||||
|
induction_task = asyncio.create_task(vocabulary_induction_loop())
|
||||||
|
recorrelation_task = asyncio.create_task(recorrelation_loop())
|
||||||
|
|
||||||
yield # --- app running ---
|
yield # --- app running ---
|
||||||
|
|
||||||
logger.info("DRB C2 Core shutting down.")
|
logger.info("DRB C2 Core shutting down.")
|
||||||
sweeper_task.cancel()
|
sweeper_task.cancel()
|
||||||
summarizer_task.cancel()
|
summarizer_task.cancel()
|
||||||
|
induction_task.cancel()
|
||||||
|
recorrelation_task.cancel()
|
||||||
await mqtt_handler.disconnect()
|
await mqtt_handler.disconnect()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -83,6 +83,13 @@ async def patch_transcript(
|
|||||||
"embedding": None,
|
"embedding": None,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Learn from the correction: diff original → corrected and add new tokens to vocabulary
|
||||||
|
system_id = call.get("system_id")
|
||||||
|
original_text = call.get("transcript_corrected") or call.get("transcript") or ""
|
||||||
|
if system_id and original_text:
|
||||||
|
from app.internal.vocabulary_learner import learn_from_correction
|
||||||
|
await learn_from_correction(system_id, original_text, body.transcript)
|
||||||
|
|
||||||
from app.routers.upload import _run_extraction_pipeline
|
from app.routers.upload import _run_extraction_pipeline
|
||||||
background_tasks.add_task(
|
background_tasks.add_task(
|
||||||
_run_extraction_pipeline,
|
_run_extraction_pipeline,
|
||||||
|
|||||||
@@ -1,11 +1,17 @@
|
|||||||
import uuid
|
import uuid
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional
|
||||||
from app.models import SystemCreate, SystemRecord
|
from app.models import SystemCreate, SystemRecord
|
||||||
from app.internal import firestore as fstore
|
from app.internal import firestore as fstore
|
||||||
|
|
||||||
router = APIRouter(prefix="/systems", tags=["systems"])
|
router = APIRouter(prefix="/systems", tags=["systems"])
|
||||||
|
|
||||||
|
|
||||||
|
class VocabularyTermBody(BaseModel):
|
||||||
|
term: str
|
||||||
|
|
||||||
|
|
||||||
@router.get("")
|
@router.get("")
|
||||||
async def list_systems():
|
async def list_systems():
|
||||||
return await fstore.collection_list("systems")
|
return await fstore.collection_list("systems")
|
||||||
@@ -42,3 +48,70 @@ async def delete_system(system_id: str):
|
|||||||
if not existing:
|
if not existing:
|
||||||
raise HTTPException(404, f"System '{system_id}' not found.")
|
raise HTTPException(404, f"System '{system_id}' not found.")
|
||||||
await fstore.doc_delete("systems", system_id)
|
await fstore.doc_delete("systems", system_id)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Vocabulary endpoints ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@router.get("/{system_id}/vocabulary")
async def get_vocabulary(system_id: str):
    """Return approved vocabulary and pending induction suggestions."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        raise HTTPException(404, f"System '{system_id}' not found.")
    # Local import — the learner module is only loaded on the success path.
    from app.internal.vocabulary_learner import get_vocabulary as _get
    return await _get(system_id)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{system_id}/vocabulary/bootstrap", status_code=202)
async def bootstrap_vocabulary(system_id: str):
    """Trigger a one-shot GPT-4o bootstrap to seed the vocabulary from local knowledge."""
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import bootstrap_system_vocabulary
    added = await bootstrap_system_vocabulary(system_id)
    return {"added": len(added), "terms": added}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{system_id}/vocabulary/terms")
async def add_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Manually add a term to the approved vocabulary."""
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import add_term
    # Whitespace is trimmed here so the stored term is always clean.
    await add_term(system_id, body.term.strip())
    return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{system_id}/vocabulary/terms")
async def remove_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Remove a term from the approved vocabulary."""
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import remove_term
    await remove_term(system_id, body.term)
    return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{system_id}/vocabulary/pending/approve")
async def approve_pending(system_id: str, body: VocabularyTermBody):
    """Move a pending induction suggestion into the approved vocabulary."""
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import approve_pending_term
    await approve_pending_term(system_id, body.term)
    return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{system_id}/vocabulary/pending/dismiss")
async def dismiss_pending(system_id: str, body: VocabularyTermBody):
    """Dismiss a pending induction suggestion without adding it."""
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")
    from app.internal.vocabulary_learner import dismiss_pending_term
    await dismiss_pending_term(system_id, body.term)
    return {"ok": True}
|
||||||
|
|||||||
@@ -151,7 +151,9 @@ async def _run_intelligence_pipeline(
|
|||||||
|
|
||||||
# Step 1: Transcription
|
# Step 1: Transcription
|
||||||
if gcs_uri:
|
if gcs_uri:
|
||||||
transcript, segments = await transcription.transcribe_call(call_id, gcs_uri, talkgroup_name)
|
transcript, segments = await transcription.transcribe_call(
|
||||||
|
call_id, gcs_uri, talkgroup_name, system_id=system_id
|
||||||
|
)
|
||||||
|
|
||||||
# Step 2: Intelligence extraction
|
# Step 2: Intelligence extraction
|
||||||
tags: list[str] = []
|
tags: list[str] = []
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
import { useState } from "react";
|
import { useState } from "react";
|
||||||
import { useSystems } from "@/lib/useSystems";
|
import { useSystems } from "@/lib/useSystems";
|
||||||
import { c2api } from "@/lib/c2api";
|
import { c2api } from "@/lib/c2api";
|
||||||
import type { SystemRecord } from "@/lib/types";
|
import type { SystemRecord, VocabularyPendingTerm } from "@/lib/types";
|
||||||
|
|
||||||
// ── P25 structured config types ──────────────────────────────────────────────
|
// ── P25 structured config types ──────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -433,6 +433,189 @@ function SystemForm({
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Vocabulary panel ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Collapsible per-system vocabulary editor.
 *
 * Shows approved terms as removable chips, a manual add form, and pending
 * induction suggestions with approve/dismiss actions. Data is fetched
 * lazily via c2api on first expansion and then mutated optimistically in
 * local state (no re-fetch after add/remove/approve/dismiss).
 */
function VocabularyPanel({ systemId }: { systemId: string }) {
  // null = not yet fetched; [] = fetched and empty.
  const [vocab, setVocab] = useState<string[] | null>(null);
  const [pending, setPending] = useState<VocabularyPendingTerm[]>([]);
  const [bootstrapped, setBootstrapped] = useState(false);
  const [loading, setLoading] = useState(false);
  const [bootstrapping, setBootstrapping] = useState(false);
  const [newTerm, setNewTerm] = useState("");
  const [adding, setAdding] = useState(false);
  const [open, setOpen] = useState(false);

  // Fetch vocabulary state once; subsequent calls are no-ops.
  async function load() {
    if (vocab !== null) return; // already loaded
    setLoading(true);
    try {
      const data = await c2api.getVocabulary(systemId);
      setVocab(data.vocabulary);
      setPending(data.vocabulary_pending);
      setBootstrapped(data.vocabulary_bootstrapped);
    } finally {
      setLoading(false);
    }
  }

  // Expand/collapse; kicks off the lazy load on first open (fire-and-forget).
  function toggle() {
    if (!open) load();
    setOpen((v) => !v);
  }

  // Run the server-side GPT bootstrap, then re-fetch the full state.
  async function handleBootstrap() {
    setBootstrapping(true);
    try {
      const result = await c2api.bootstrapVocabulary(systemId);
      const data = await c2api.getVocabulary(systemId);
      setVocab(data.vocabulary);
      setPending(data.vocabulary_pending);
      setBootstrapped(true);
      alert(`Bootstrap added ${result.added} term(s).`);
    } finally {
      setBootstrapping(false);
    }
  }

  // Submit handler for the manual add form; appends optimistically.
  async function handleAdd(e: React.FormEvent) {
    e.preventDefault();
    const term = newTerm.trim();
    if (!term) return;
    setAdding(true);
    try {
      await c2api.addVocabularyTerm(systemId, term);
      setVocab((v) => (v ? [...v, term] : [term]));
      setNewTerm("");
    } finally {
      setAdding(false);
    }
  }

  async function handleRemove(term: string) {
    await c2api.removeVocabularyTerm(systemId, term);
    setVocab((v) => (v ?? []).filter((t) => t !== term));
  }

  // Approve a pending suggestion: moves it from pending into approved.
  async function handleApprove(term: string) {
    await c2api.approvePendingTerm(systemId, term);
    setVocab((v) => (v ? [...v, term] : [term]));
    setPending((p) => p.filter((t) => t.term !== term));
  }

  async function handleDismiss(term: string) {
    await c2api.dismissPendingTerm(systemId, term);
    setPending((p) => p.filter((t) => t.term !== term));
  }

  return (
    <div className="mt-3 border-t border-gray-800 pt-3">
      <button
        onClick={toggle}
        className="text-xs text-gray-500 hover:text-gray-300 font-mono transition-colors flex items-center gap-1"
      >
        <span>{open ? "▲" : "▼"}</span>
        <span>
          Vocabulary
          {vocab !== null && <span className="text-gray-600 ml-1">({vocab.length} terms{pending.length > 0 ? `, ${pending.length} pending` : ""})</span>}
        </span>
      </button>

      {open && (
        <div className="mt-3 space-y-3 font-mono text-xs">
          {loading && <p className="text-gray-600 italic">Loading…</p>}

          {!loading && vocab !== null && (
            <>
              {/* Bootstrap button */}
              <div className="flex items-center gap-3">
                <button
                  onClick={handleBootstrap}
                  disabled={bootstrapping}
                  className="bg-indigo-800 hover:bg-indigo-700 disabled:opacity-50 text-indigo-200 px-3 py-1.5 rounded-lg text-xs transition-colors"
                >
                  {bootstrapping ? "Bootstrapping…" : bootstrapped ? "Re-bootstrap with AI" : "Bootstrap with AI"}
                </button>
                <span className="text-gray-600">GPT-4o generates local knowledge (agencies, units, streets)</span>
              </div>

              {/* Approved vocabulary chips */}
              <div>
                <p className="text-gray-500 uppercase tracking-wider mb-1.5">Approved ({vocab.length})</p>
                {vocab.length > 0 ? (
                  <div className="flex flex-wrap gap-1.5">
                    {vocab.map((term) => (
                      <span
                        key={term}
                        className="inline-flex items-center gap-1 bg-gray-800 text-gray-300 px-2 py-0.5 rounded-full"
                      >
                        {term}
                        <button
                          onClick={() => handleRemove(term)}
                          className="text-gray-600 hover:text-red-400 transition-colors leading-none"
                        >
                          ×
                        </button>
                      </span>
                    ))}
                  </div>
                ) : (
                  <p className="text-gray-600 italic">No terms yet — bootstrap or add manually.</p>
                )}
              </div>

              {/* Add term */}
              <form onSubmit={handleAdd} className="flex gap-2">
                <input
                  value={newTerm}
                  onChange={(e) => setNewTerm(e.target.value)}
                  placeholder="Add term (e.g. 5-baker, YVAC)"
                  className="flex-1 bg-gray-800 border border-gray-700 rounded-lg px-3 py-1.5 text-white text-xs font-mono focus:outline-none focus:border-indigo-500"
                />
                <button
                  type="submit"
                  disabled={adding || !newTerm.trim()}
                  className="bg-gray-700 hover:bg-gray-600 disabled:opacity-50 text-gray-200 px-3 py-1.5 rounded-lg transition-colors"
                >
                  Add
                </button>
              </form>

              {/* Pending induction suggestions */}
              {pending.length > 0 && (
                <div>
                  <p className="text-gray-500 uppercase tracking-wider mb-1.5">
                    Induction suggestions ({pending.length})
                  </p>
                  <div className="space-y-1">
                    {pending.map((p) => (
                      <div key={p.term} className="flex items-center gap-2">
                        <span className="text-gray-300 flex-1">{p.term}</span>
                        <span className="text-gray-600">{p.source}</span>
                        <button
                          onClick={() => handleApprove(p.term)}
                          className="text-green-500 hover:text-green-400 transition-colors px-1"
                        >
                          ✓
                        </button>
                        <button
                          onClick={() => handleDismiss(p.term)}
                          className="text-gray-600 hover:text-red-400 transition-colors px-1"
                        >
                          ✕
                        </button>
                      </div>
                    ))}
                  </div>
                </div>
              )}
            </>
          )}
        </div>
      )}
    </div>
  );
}
|
||||||
|
|
||||||
// ── Systems list page ─────────────────────────────────────────────────────────
|
// ── Systems list page ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export default function SystemsPage() {
|
export default function SystemsPage() {
|
||||||
@@ -509,6 +692,7 @@ export default function SystemsPage() {
|
|||||||
Delete
|
Delete
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
<VocabularyPanel systemId={s.system_id} />
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
})}
|
})}
|
||||||
|
|||||||
@@ -93,4 +93,20 @@ export const c2api = {
|
|||||||
// Node key management
|
// Node key management
|
||||||
reissueNodeKey: (nodeId: string) =>
|
reissueNodeKey: (nodeId: string) =>
|
||||||
request(`/nodes/${nodeId}/reissue-key`, { method: "POST" }),
|
request(`/nodes/${nodeId}/reissue-key`, { method: "POST" }),
|
||||||
|
|
||||||
|
// Vocabulary
|
||||||
|
getVocabulary: (systemId: string) =>
|
||||||
|
request<{ vocabulary: string[]; vocabulary_pending: { term: string; source: string; added_at: string }[]; vocabulary_bootstrapped: boolean }>(
|
||||||
|
`/systems/${systemId}/vocabulary`
|
||||||
|
),
|
||||||
|
bootstrapVocabulary: (systemId: string) =>
|
||||||
|
request<{ added: number; terms: string[] }>(`/systems/${systemId}/vocabulary/bootstrap`, { method: "POST" }),
|
||||||
|
addVocabularyTerm: (systemId: string, term: string) =>
|
||||||
|
request(`/systems/${systemId}/vocabulary/terms`, { method: "POST", body: JSON.stringify({ term }) }),
|
||||||
|
removeVocabularyTerm: (systemId: string, term: string) =>
|
||||||
|
request(`/systems/${systemId}/vocabulary/terms`, { method: "DELETE", body: JSON.stringify({ term }) }),
|
||||||
|
approvePendingTerm: (systemId: string, term: string) =>
|
||||||
|
request(`/systems/${systemId}/vocabulary/pending/approve`, { method: "POST", body: JSON.stringify({ term }) }),
|
||||||
|
dismissPendingTerm: (systemId: string, term: string) =>
|
||||||
|
request(`/systems/${systemId}/vocabulary/pending/dismiss`, { method: "POST", body: JSON.stringify({ term }) }),
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -13,11 +13,20 @@ export interface NodeRecord {
|
|||||||
approval_status: ApprovalStatus | null;
|
approval_status: ApprovalStatus | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A vocabulary term proposed automatically (transcript induction or a
// correction diff) that an admin has not yet approved or dismissed.
export interface VocabularyPendingTerm {
  term: string;
  // Which pipeline proposed the term.
  source: "induction" | "correction";
  // Timestamp string of when the suggestion was queued — the server emits
  // ISO-8601 UTC (datetime.now(timezone.utc).isoformat()).
  added_at: string;
}
|
||||||
|
|
||||||
export interface SystemRecord {
|
export interface SystemRecord {
|
||||||
system_id: string;
|
system_id: string;
|
||||||
name: string;
|
name: string;
|
||||||
type: string; // P25 | DMR | NBFM
|
type: string; // P25 | DMR | NBFM
|
||||||
config: Record<string, unknown>;
|
config: Record<string, unknown>;
|
||||||
|
vocabulary?: string[];
|
||||||
|
vocabulary_pending?: VocabularyPendingTerm[];
|
||||||
|
vocabulary_bootstrapped?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TranscriptSegment {
|
export interface TranscriptSegment {
|
||||||
|
|||||||
Reference in New Issue
Block a user