From 9842b18799fc1f8d3f9586cec83ce2e9b75f5fc6 Mon Sep 17 00:00:00 2001 From: Logan Date: Wed, 3 Jun 2026 00:51:25 -0400 Subject: [PATCH] Fix correlation false-merge, switch STT to whisper-1 without vocab prompt - correlator: unit_overlap on dispatch channels now applies content divergence check when the call has geocoded coords but the incident doesn't; previously this gap caused unrelated calls to merge into stale incidents (e.g. patrol officer at a second scene 70 min later) - STT: switch default model from gpt-4o-transcribe to whisper-1, which faithfully transcribes all exchanges in multi-PTT recordings; gpt-4o was silently dropping utterances, starving the correlation engine - STT: remove vocabulary from the Whisper prompt; whisper-1 echoes prompted terms into noise/silence, skewing extracted incident data; vocabulary context is now applied exclusively in the GPT extraction step (build_gpt_vocab_block) where it is used as reference only --- drb-c2-core/app/config.py | 2 +- .../app/internal/incident_correlator.py | 12 ++++++++++ drb-c2-core/app/internal/transcription.py | 23 ++++++++----------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/drb-c2-core/app/config.py b/drb-c2-core/app/config.py index 74942c2..2720be1 100644 --- a/drb-c2-core/app/config.py +++ b/drb-c2-core/app/config.py @@ -19,7 +19,7 @@ class Settings(BaseSettings): # OpenAI (STT + intelligence) openai_api_key: Optional[str] = None - stt_model: str = "gpt-4o-transcribe" # whisper-1 | gpt-4o-mini-transcribe | gpt-4o-transcribe + stt_model: str = "whisper-1" # whisper-1 | gpt-4o-mini-transcribe | gpt-4o-transcribe # Google Maps (geocoding) google_maps_api_key: Optional[str] = None diff --git a/drb-c2-core/app/internal/incident_correlator.py b/drb-c2-core/app/internal/incident_correlator.py index d9b72c1..d184af8 100644 --- a/drb-c2-core/app/internal/incident_correlator.py +++ b/drb-c2-core/app/internal/incident_correlator.py @@ -1030,6 +1030,18 @@ def _call_fits_incident( if dist_km > proximity_km: logger.info(f" fits[{inc_id}]: unit_overlap({matched_units}) but location_conflict dist={dist_km:.2f}km → unit_loc_conflict") return False, "unit_loc_conflict" + elif call_embedding and idle_min >= 15: + # Call has geocode but incident doesn't — fall back to content + # divergence as a location proxy. Without this, stale incidents + # that never geocoded absorb unrelated calls purely on unit + # overlap (e.g. a patrol officer working a second scene 70 min + # after the original call). + inc_emb_u = inc.get("embedding") + if inc_emb_u: + sim = _cosine_similarity(call_embedding, inc_emb_u) + if sim < 0.82: + logger.info(f" fits[{inc_id}]: unit_overlap({matched_units}) but content_divergence (has_call_coords/no_inc_coords) sim={sim:.3f} → content_divergence") + return False, "content_divergence" elif call_embedding and idle_min >= 15: # No geocode available AND old incident: use content divergence as a # location-proxy veto. After 15+ minutes an officer at a completely diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py index 8af0db9..0076f45 100644 --- a/drb-c2-core/app/internal/transcription.py +++ b/drb-c2-core/app/internal/transcription.py @@ -40,16 +40,9 @@ async def transcribe_call( if not gcs_uri or not gcs_uri.startswith("gs://"): return None, [] - # Load vocabulary for this system (empty list if none yet) - vocabulary: list[str] = [] - if system_id: - from app.internal.vocabulary_learner import get_vocabulary - vocab_data = await get_vocabulary(system_id) - vocabulary = vocab_data.get("vocabulary") or [] - try: transcript, segments = await asyncio.to_thread( - _sync_transcribe, gcs_uri, talkgroup_name, vocabulary + _sync_transcribe, gcs_uri, talkgroup_name ) except Exception as e: logger.warning(f"Transcription failed for call {call_id}: {e}") @@ -74,7 +67,6 @@ async def transcribe_call( def _sync_transcribe( gcs_uri: str, talkgroup_name: Optional[str] = None, - vocabulary: Optional[list[str]] = None, ) -> tuple[Optional[str], list[dict]]: """Download audio from GCS and transcribe with OpenAI Whisper.""" from google.cloud import storage as gcs @@ -108,13 +100,16 @@ def _sync_transcribe( try: blob.download_to_filename(tmp_path) - from app.internal.vocabulary_learner import build_whisper_vocab_prompt - vocab_prefix = build_whisper_vocab_prompt(vocabulary or []) - tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else "" - prompt = tg_prefix + vocab_prefix + _WHISPER_PROMPT + tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else "" + # Vocabulary is intentionally excluded from the Whisper prompt. + # whisper-1 treats the prompt as a transcription prior and echoes + # vocabulary terms into noise/silence, polluting downstream extraction. + # Vocabulary context is applied in the GPT extraction step instead, + # where it is used as reference rather than a transcription prior. + prompt = tg_prefix + _WHISPER_PROMPT # Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob). - # Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text. + # gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text. use_verbose = settings.stt_model == "whisper-1" openai_client = OpenAI(api_key=settings.openai_api_key)