From 92c9d8effc1984d99c1dac0b3748523268322db4 Mon Sep 17 00:00:00 2001 From: Logan Date: Sun, 24 May 2026 01:30:40 -0400 Subject: [PATCH] fix: garbage transcript detection, county geocoding, dispatch channel detection - intelligence.py: detect Whisper phonetic-alphabet hallucinations before sending to GPT; skip extraction entirely to prevent fake units/tags corrupting correlation - intelligence.py: upgrade node reverse-geocode from zoom=5 (state) to zoom=10 (county) and include county in address queries so common street names (e.g. "East Main Street") resolve to the correct county - incident_correlator.py: add "patched" and "primary" to dispatch channel regex so patched trunking channels are treated as shared backbones - incident_correlator.py: add 20-min idle gate for tactical channel default so a reused frequency can't absorb a new unrelated incident --- .../app/internal/incident_correlator.py | 15 ++- drb-c2-core/app/internal/intelligence.py | 98 +++++++++++++++---- 2 files changed, 93 insertions(+), 20 deletions(-) diff --git a/drb-c2-core/app/internal/incident_correlator.py b/drb-c2-core/app/internal/incident_correlator.py index 29974d5..1ba09be 100644 --- a/drb-c2-core/app/internal/incident_correlator.py +++ b/drb-c2-core/app/internal/incident_correlator.py @@ -46,7 +46,12 @@ from app.internal.logger import logger from app.internal import firestore as fstore from app.config import settings -_DISPATCH_TG_RE = re.compile(r"\bdispatch\b|\bdisp\b", re.IGNORECASE) +_DISPATCH_TG_RE = re.compile( + r"\bdispatch\b|\bdisp\b" + r"|\bpatched\b" # patched channels aggregate multiple call streams + r"|\bprimary\b", # "Primary" channels serve as shared backbones + re.IGNORECASE, +) # Matches route/road identifiers in location strings for cross-system parent detection. # Groups: numbered routes (Route 202, NY-9, US-6, I-87, CR-35) and named parkways/highways. @@ -800,8 +805,12 @@ def _call_fits_incident( # Shared dispatch channel — do not link without at least one positive signal. return False - # Tactical channel: one scene per channel → link by default. - return True + # Tactical channel: one scene per channel. + # Within 20 min of the last incident activity, link by default — same + # working channel almost certainly means same scene. + # After 20 min of silence, require at least one positive signal; the same + # frequency can be reused for a new unrelated incident later in the shift. + return idle_min < 20.0 async def _update_incident( diff --git a/drb-c2-core/app/internal/intelligence.py b/drb-c2-core/app/internal/intelligence.py index f07bb72..956aaf1 100644 --- a/drb-c2-core/app/internal/intelligence.py +++ b/drb-c2-core/app/internal/intelligence.py @@ -54,8 +54,21 @@ Talkgroup: {talkgroup_name} # Nominatim viewbox half-width in degrees (~11 km at mid-latitudes) _GEO_DELTA = 0.5 # ~55 km bias radius; viewbox used as preference, not hard bound -# node_id → state abbreviation/name from one-time reverse geocode -_node_state_cache: dict[str, str] = {} +# node_id → {"county": str, "state": str} from one-time reverse geocode +_node_place_cache: dict[str, dict] = {} + +# Police/law-enforcement phonetic alphabet words (APCO + NATO). +# A run of 5+ of these in a transcript is a strong Whisper hallucination signal. +_PHONETIC_ALPHA_WORDS = frozenset({ + # APCO (law enforcement) + "adam", "baker", "charles", "david", "edward", "frank", "george", "henry", + "ida", "john", "king", "lincoln", "mary", "nora", "ocean", "paul", "queen", + "robert", "sam", "tom", "union", "victor", "william", "x-ray", "young", "zebra", + # NATO + "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", + "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa", + "quebec", "romeo", "sierra", "tango", "uniform", "whiskey", "yankee", "zulu", +}) # Strip P25 service suffixes to extract the municipality name from a talkgroup _TG_SUFFIX_RE = re.compile( @@ -67,6 +80,35 @@ _TG_SUFFIX_RE = re.compile( ) +def _is_garbage_transcript(transcript: str) -> bool: + """ + Detect Whisper hallucinations that should be discarded before GPT processing. + + Two signals: + 1. Phonetic-alphabet run ≥ 5 consecutive words: Whisper hallucinated a + training-data sequence (common on silent or noise-only audio). + 2. High comma density (> 15% of tokens) in long transcripts: list-dump + hallucinations contain far more commas than real radio speech. + """ + words = re.findall(r"[\w\-]+", transcript.lower()) + if not words: + return False + + run = 0 + for w in words: + if w in _PHONETIC_ALPHA_WORDS: + run += 1 + if run >= 5: + return True + else: + run = 0 + + if len(words) > 30 and transcript.count(",") / len(words) > 0.15: + return True + + return False + + def _build_ten_codes_block(ten_codes: dict[str, str]) -> str: if not ten_codes: return "" @@ -107,6 +149,13 @@ async def extract_scenes( vocabulary = system_doc.get("vocabulary") or [] ten_codes = system_doc.get("ten_codes") or {} + if _is_garbage_transcript(transcript): + logger.warning( + f"Intelligence: call {call_id} — garbage transcript detected " + f"(Whisper hallucination), skipping extraction" + ) + return [] + raw_scenes: list[dict] = await asyncio.to_thread( _sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary, ten_codes, @@ -144,9 +193,14 @@ async def extract_scenes( # Geocode this scene's location location_coords: Optional[dict] = None if location and node_lat is not None and node_lon is not None: - state = await _get_node_state(node_id, node_lat, node_lon) - muni = _municipality_from_tg(talkgroup_name) - hint_parts = [p for p in [muni, state] if p] + place = await _get_node_place(node_id, node_lat, node_lon) + muni = _municipality_from_tg(talkgroup_name) + county = place.get("county", "") + state = place.get("state", "") + # Build hint from most specific to least: municipality → county → state. + # Including county prevents common street names (e.g. "East Main Street") + # from resolving to a wrong county when the address is ambiguous. + hint_parts = [p for p in [muni, county, state] if p] query = f"{location}, {', '.join(hint_parts)}" if hint_parts else location location_coords = await _geocode_location(query, node_lat, node_lon) @@ -252,33 +306,43 @@ async def _geocode_location( return None -async def _get_node_state(node_id: str, lat: float, lon: float) -> Optional[str]: +async def _get_node_place(node_id: str, lat: float, lon: float) -> dict: """ - Reverse geocode the node's position once to extract its state. + Reverse geocode the node's position once to extract county + state. + Uses zoom=10 so Nominatim returns county-level granularity, which is + included in geocoding queries to prevent common street names from resolving + to a wrong county (e.g. "East Main Street" in Orange vs. Westchester). Result is cached for the process lifetime — nodes don't move. + Returns dict with "county" and "state" keys (empty string if not found). """ - if node_id in _node_state_cache: - return _node_state_cache[node_id] + if node_id in _node_place_cache: + return _node_place_cache[node_id] import httpx headers = {"User-Agent": "DRB-Dispatch/1.0 (public-safety radio monitor)"} + place: dict = {"county": "", "state": ""} try: async with httpx.AsyncClient(timeout=5.0) as client: r = await client.get( "https://nominatim.openstreetmap.org/reverse", - params={"lat": lat, "lon": lon, "format": "json", "zoom": 5}, + params={"lat": lat, "lon": lon, "format": "json", "zoom": 10}, headers=headers, ) r.raise_for_status() data = r.json() - state = data.get("address", {}).get("state", "") - if state: - _node_state_cache[node_id] = state - logger.info(f"Node {node_id} reverse-geocoded to state: {state!r}") - return state + addr = data.get("address", {}) + place["county"] = addr.get("county", "") + place["state"] = addr.get("state", "") except Exception as e: - logger.warning(f"Node state reverse geocode failed: {e}") - return None + logger.warning(f"Node place reverse geocode failed: {e}") + + if place["county"] or place["state"]: + _node_place_cache[node_id] = place + logger.info( + f"Node {node_id} reverse-geocoded: county={place['county']!r}, " + f"state={place['state']!r}" + ) + return place def _municipality_from_tg(tg_name: Optional[str]) -> Optional[str]: