fix: garbage transcript detection, county geocoding, dispatch channel detection

- intelligence.py: detect Whisper hallucinations (phonetic-alphabet runs and comma-dense list dumps) before sending to GPT; skip extraction entirely to prevent fake units/tags from corrupting correlation
- intelligence.py: upgrade node reverse-geocode from zoom=5 (state) to zoom=10 (county) and include the county in address queries so common street names (e.g. "East Main Street") resolve to the correct county
- incident_correlator.py: add "patched" and "primary" to the dispatch channel regex so patched trunking channels are treated as shared backbones
- incident_correlator.py: add a 20-min idle gate to the tactical-channel default so a reused frequency can't absorb a new, unrelated incident
2026-05-24 01:30:40 -04:00
parent 1071bcd3e8
commit 92c9d8effc
2 changed files with 93 additions and 20 deletions
@@ -46,7 +46,12 @@ from app.internal.logger import logger
 from app.internal import firestore as fstore
 from app.config import settings

-_DISPATCH_TG_RE = re.compile(r"\bdispatch\b|\bdisp\b", re.IGNORECASE)
+_DISPATCH_TG_RE = re.compile(
+    r"\bdispatch\b|\bdisp\b"
+    r"|\bpatched\b"   # patched channels aggregate multiple call streams
+    r"|\bprimary\b",  # "Primary" channels serve as shared backbones
+    re.IGNORECASE,
+)

 # Matches route/road identifiers in location strings for cross-system parent detection.
 # Groups: numbered routes (Route 202, NY-9, US-6, I-87, CR-35) and named parkways/highways.
@@ -800,8 +805,12 @@ def _call_fits_incident(
        # Shared dispatch channel — do not link without at least one positive signal.
        return False

-    # Tactical channel: one scene per channel → link by default.
-    return True
+    # Tactical channel: one scene per channel.
+    # Within 20 min of the last incident activity, link by default — same
+    # working channel almost certainly means same scene.
+    # After 20 min of silence, require at least one positive signal; the same
+    # frequency can be reused for a new unrelated incident later in the shift.
+    return idle_min < 20.0


 async def _update_incident(
@@ -54,8 +54,21 @@ Talkgroup: {talkgroup_name}
 # Nominatim viewbox half-width in degrees (~11 km at mid-latitudes)
 _GEO_DELTA = 0.5  # ~55 km bias radius; viewbox used as preference, not hard bound

-# node_id → state abbreviation/name from one-time reverse geocode
-_node_state_cache: dict[str, str] = {}
+# node_id → {"county": str, "state": str} from one-time reverse geocode
+_node_place_cache: dict[str, dict] = {}
+
+# Police/law-enforcement phonetic alphabet words (APCO + NATO).
+# A run of 5+ of these in a transcript is a strong Whisper hallucination signal.
+_PHONETIC_ALPHA_WORDS = frozenset({
+    # APCO (law enforcement)
+    "adam", "baker", "charles", "david", "edward", "frank", "george", "henry",
+    "ida", "john", "king", "lincoln", "mary", "nora", "ocean", "paul", "queen",
+    "robert", "sam", "tom", "union", "victor", "william", "x-ray", "young", "zebra",
+    # NATO ("victor" and "x-ray" are shared with APCO and already listed above)
+    "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel",
+    "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa",
+    "quebec", "romeo", "sierra", "tango", "uniform", "whiskey", "yankee", "zulu",
+})

 # Strip P25 service suffixes to extract the municipality name from a talkgroup
 _TG_SUFFIX_RE = re.compile(
@@ -67,6 +80,35 @@ _TG_SUFFIX_RE = re.compile(
 )


+def _is_garbage_transcript(transcript: str) -> bool:
+    """
+    Detect Whisper hallucinations that should be discarded before GPT processing.
+
+    Two signals:
+    1. Phonetic-alphabet run ≥ 5 consecutive words: Whisper hallucinated a
+       training-data sequence (common on silent or noise-only audio).
+    2. High comma density (> 0.15 commas per word) in transcripts over 30 words:
+       list-dump hallucinations contain far more commas than real radio speech.
+    """
+    words = re.findall(r"[\w\-]+", transcript.lower())
+    if not words:
+        return False
+
+    run = 0
+    for w in words:
+        if w in _PHONETIC_ALPHA_WORDS:
+            run += 1
+            if run >= 5:
+                return True
+        else:
+            run = 0
+
+    if len(words) > 30 and transcript.count(",") / len(words) > 0.15:
+        return True
+
+    return False
+
+
 def _build_ten_codes_block(ten_codes: dict[str, str]) -> str:
    if not ten_codes:
        return ""
@@ -107,6 +149,13 @@ async def extract_scenes(
            vocabulary = system_doc.get("vocabulary") or []
            ten_codes  = system_doc.get("ten_codes") or {}

+    if _is_garbage_transcript(transcript):
+        logger.warning(
+            f"Intelligence: call {call_id} — garbage transcript detected "
+            f"(Whisper hallucination), skipping extraction"
+        )
+        return []
+
    raw_scenes: list[dict] = await asyncio.to_thread(
        _sync_extract,
        transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary, ten_codes,
@@ -144,9 +193,14 @@ async def extract_scenes(
        # Geocode this scene's location
        location_coords: Optional[dict] = None
        if location and node_lat is not None and node_lon is not None:
-            state = await _get_node_state(node_id, node_lat, node_lon)
-            muni  = _municipality_from_tg(talkgroup_name)
-            hint_parts = [p for p in [muni, state] if p]
+            place  = await _get_node_place(node_id, node_lat, node_lon)
+            muni   = _municipality_from_tg(talkgroup_name)
+            county = place.get("county", "")
+            state  = place.get("state", "")
+            # Build hint from most specific to least: municipality → county → state.
+            # Including county prevents common street names (e.g. "East Main Street")
+            # from resolving to a wrong county when the address is ambiguous.
+            hint_parts = [p for p in [muni, county, state] if p]
            query = f"{location}, {', '.join(hint_parts)}" if hint_parts else location
            location_coords = await _geocode_location(query, node_lat, node_lon)

@@ -252,33 +306,43 @@ async def _geocode_location(
    return None


-async def _get_node_state(node_id: str, lat: float, lon: float) -> Optional[str]:
+async def _get_node_place(node_id: str, lat: float, lon: float) -> dict:
    """
-    Reverse geocode the node's position once to extract its state.
+    Reverse geocode the node's position once to extract county + state.
+    Uses zoom=10 so Nominatim returns county-level granularity, which is
+    included in geocoding queries to prevent common street names from resolving
+    to a wrong county (e.g. "East Main Street" in Orange vs. Westchester).
    Result is cached for the process lifetime — nodes don't move.
+    Returns a dict with "county" and "state" keys ("" if not found); all-empty results are not cached.
    """
-    if node_id in _node_state_cache:
-        return _node_state_cache[node_id]
+    if node_id in _node_place_cache:
+        return _node_place_cache[node_id]

    import httpx
    headers = {"User-Agent": "DRB-Dispatch/1.0 (public-safety radio monitor)"}
+    place: dict = {"county": "", "state": ""}
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            r = await client.get(
                "https://nominatim.openstreetmap.org/reverse",
-                params={"lat": lat, "lon": lon, "format": "json", "zoom": 5},
+                params={"lat": lat, "lon": lon, "format": "json", "zoom": 10},
                headers=headers,
            )
            r.raise_for_status()
            data = r.json()
-            state = data.get("address", {}).get("state", "")
-            if state:
-                _node_state_cache[node_id] = state
-                logger.info(f"Node {node_id} reverse-geocoded to state: {state!r}")
-                return state
+            addr = data.get("address", {})
+            place["county"] = addr.get("county", "")
+            place["state"]  = addr.get("state", "")
    except Exception as e:
-        logger.warning(f"Node state reverse geocode failed: {e}")
-    return None
+        logger.warning(f"Node place reverse geocode failed: {e}")
+
+    if place["county"] or place["state"]:
+        _node_place_cache[node_id] = place
+        logger.info(
+            f"Node {node_id} reverse-geocoded: county={place['county']!r}, "
+            f"state={place['state']!r}"
+        )
+    return place


 def _municipality_from_tg(tg_name: Optional[str]) -> Optional[str]: