From 9842b18799fc1f8d3f9586cec83ce2e9b75f5fc6 Mon Sep 17 00:00:00 2001
From: Logan <Logan@simplestepsolutions.com>
Date: Wed, 3 Jun 2026 00:51:25 -0400
Subject: [PATCH] Fix correlation false-merge, switch STT to whisper-1 without
 vocab prompt

- correlator: unit_overlap on dispatch channels now applies a content
  divergence check (embedding cosine similarity < 0.82) when the call
  has geocoded coords but the incident doesn't and idle_min >= 15;
  previously this gap caused unrelated calls to merge into stale
  incidents (e.g. patrol officer at a second scene 70 min later)
- STT: switch default model from gpt-4o-transcribe to whisper-1, which
  faithfully transcribes all exchanges in multi-PTT recordings;
  gpt-4o-transcribe was silently dropping utterances, starving the
  correlation engine
- STT: remove vocabulary from the Whisper prompt; whisper-1 echoes
  prompted terms into noise/silence, skewing extracted incident data;
  vocabulary context is now applied exclusively in the GPT extraction
  step (build_gpt_vocab_block) where it is used as reference only
---
 drb-c2-core/app/config.py                     |  2 +-
 .../app/internal/incident_correlator.py       | 12 ++++++++++
 drb-c2-core/app/internal/transcription.py     | 23 ++++++++-----------
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/drb-c2-core/app/config.py b/drb-c2-core/app/config.py
index 74942c2..2720be1 100644
--- a/drb-c2-core/app/config.py
+++ b/drb-c2-core/app/config.py
@@ -19,7 +19,7 @@ class Settings(BaseSettings):
 
     # OpenAI (STT + intelligence)
     openai_api_key: Optional[str] = None
-    stt_model: str = "gpt-4o-transcribe"  # whisper-1 | gpt-4o-mini-transcribe | gpt-4o-transcribe
+    stt_model: str = "whisper-1"  # whisper-1 | gpt-4o-mini-transcribe | gpt-4o-transcribe
 
     # Google Maps (geocoding)
     google_maps_api_key: Optional[str] = None
diff --git a/drb-c2-core/app/internal/incident_correlator.py b/drb-c2-core/app/internal/incident_correlator.py
index d9b72c1..d184af8 100644
--- a/drb-c2-core/app/internal/incident_correlator.py
+++ b/drb-c2-core/app/internal/incident_correlator.py
@@ -1030,6 +1030,18 @@ def _call_fits_incident(
                     if dist_km > proximity_km:
                         logger.info(f"  fits[{inc_id}]: unit_overlap({matched_units}) but location_conflict dist={dist_km:.2f}km → unit_loc_conflict")
                         return False, "unit_loc_conflict"
+                elif call_embedding and idle_min >= 15:
+                    # Call has geocode but incident doesn't — fall back to content
+                    # divergence as a location proxy.  Without this, stale incidents
+                    # that never geocoded absorb unrelated calls purely on unit overlap
+                    # (e.g. a patrol officer working a second scene 70 min after the
+                    # original call).  Check is skipped if the incident has no embedding.
+                    inc_emb_u = inc.get("embedding")
+                    if inc_emb_u:
+                        sim = _cosine_similarity(call_embedding, inc_emb_u)
+                        if sim < 0.82:
+                            logger.info(f"  fits[{inc_id}]: unit_overlap({matched_units}) but content_divergence (has_call_coords/no_inc_coords) sim={sim:.3f} → content_divergence")
+                            return False, "content_divergence"
             elif call_embedding and idle_min >= 15:
                 # No geocode available AND old incident: use content divergence as a
                 # location-proxy veto.  After 15+ minutes an officer at a completely
diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py
index 8af0db9..0076f45 100644
--- a/drb-c2-core/app/internal/transcription.py
+++ b/drb-c2-core/app/internal/transcription.py
@@ -40,16 +40,9 @@ async def transcribe_call(
     if not gcs_uri or not gcs_uri.startswith("gs://"):
         return None, []
 
-    # Load vocabulary for this system (empty list if none yet)
-    vocabulary: list[str] = []
-    if system_id:
-        from app.internal.vocabulary_learner import get_vocabulary
-        vocab_data = await get_vocabulary(system_id)
-        vocabulary = vocab_data.get("vocabulary") or []
-
     try:
         transcript, segments = await asyncio.to_thread(
-            _sync_transcribe, gcs_uri, talkgroup_name, vocabulary
+            _sync_transcribe, gcs_uri, talkgroup_name
         )
     except Exception as e:
         logger.warning(f"Transcription failed for call {call_id}: {e}")
@@ -74,7 +67,6 @@ async def transcribe_call(
 def _sync_transcribe(
     gcs_uri: str,
     talkgroup_name: Optional[str] = None,
-    vocabulary: Optional[list[str]] = None,
 ) -> tuple[Optional[str], list[dict]]:
     """Download audio from GCS and transcribe with OpenAI Whisper."""
     from google.cloud import storage as gcs
@@ -108,13 +100,16 @@ def _sync_transcribe(
     try:
         blob.download_to_filename(tmp_path)
 
-        from app.internal.vocabulary_learner import build_whisper_vocab_prompt
-        vocab_prefix = build_whisper_vocab_prompt(vocabulary or [])
-        tg_prefix    = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
-        prompt       = tg_prefix + vocab_prefix + _WHISPER_PROMPT
+        tg_prefix = f"Talkgroup: {talkgroup_name}. " if talkgroup_name else ""
+        # Vocabulary is intentionally excluded from the Whisper prompt.
+        # whisper-1 treats the prompt as a transcription prior and echoes
+        # vocabulary terms into noise/silence, polluting downstream extraction.
+        # Vocabulary context is applied in the GPT extraction step instead,
+        # where it is used as reference rather than a transcription prior.
+        prompt = tg_prefix + _WHISPER_PROMPT
 
         # Only whisper-1 supports verbose_json (per-segment timestamps + no_speech_prob).
-        # Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe) only accept json/text.
+        # gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text.
         use_verbose = settings.stt_model == "whisper-1"
 
         openai_client = OpenAI(api_key=settings.openai_api_key)