fix: garbage transcript detection, county geocoding, dispatch channel detection

- intelligence.py: detect Whisper phonetic-alphabet hallucinations before
  sending to GPT; skip extraction entirely to prevent fake units/tags
  corrupting correlation
- intelligence.py: upgrade node reverse-geocode from zoom=5 (state) to
  zoom=10 (county) and include county in address queries so common street
  names (e.g. "East Main Street") resolve to the correct county
- incident_correlator.py: add "patched" and "primary" to dispatch channel
  regex so patched trunking channels are treated as shared backbones
- incident_correlator.py: add 20-min idle gate for tactical channel default
  so a reused frequency can't absorb a new unrelated incident
This commit is contained in:
Logan
2026-05-24 01:30:40 -04:00
parent 1071bcd3e8
commit 92c9d8effc
2 changed files with 93 additions and 20 deletions
@@ -46,7 +46,12 @@ from app.internal.logger import logger
from app.internal import firestore as fstore
from app.config import settings
_DISPATCH_TG_RE = re.compile(r"\bdispatch\b|\bdisp\b", re.IGNORECASE)
_DISPATCH_TG_RE = re.compile(
r"\bdispatch\b|\bdisp\b"
r"|\bpatched\b" # patched channels aggregate multiple call streams
r"|\bprimary\b", # "Primary" channels serve as shared backbones
re.IGNORECASE,
)
# Matches route/road identifiers in location strings for cross-system parent detection.
# Groups: numbered routes (Route 202, NY-9, US-6, I-87, CR-35) and named parkways/highways.
@@ -800,8 +805,12 @@ def _call_fits_incident(
# Shared dispatch channel — do not link without at least one positive signal.
return False
# Tactical channel: one scene per channel → link by default.
return True
# Tactical channel: one scene per channel.
# Within 20 min of the last incident activity, link by default — same
# working channel almost certainly means same scene.
# After 20 min of silence, require at least one positive signal; the same
# frequency can be reused for a new unrelated incident later in the shift.
return idle_min < 20.0
async def _update_incident(
+81 -17
View File
@@ -54,8 +54,21 @@ Talkgroup: {talkgroup_name}
# Nominatim viewbox half-width in degrees (0.5° ≈ 55 km north–south)
_GEO_DELTA = 0.5 # ~55 km bias radius; viewbox used as preference, not hard bound
# node_id → state abbreviation/name from one-time reverse geocode
_node_state_cache: dict[str, str] = {}
# node_id → {"county": str, "state": str} from one-time reverse geocode
_node_place_cache: dict[str, dict] = {}
# Spelling-alphabet vocabulary heard on public-safety radio: the APCO set used
# by law enforcement plus the NATO set. Real traffic rarely strings many of
# these together, so a run of 5+ in one transcript is treated as a strong
# Whisper hallucination signal.
_PHONETIC_ALPHA_WORDS = frozenset(
    (
        # APCO (law enforcement), A through Z
        "adam baker charles david edward frank george henry "
        "ida john king lincoln mary nora ocean paul queen "
        "robert sam tom union victor william x-ray young zebra "
        # NATO ("victor" and "x-ray" are already listed above)
        "alpha bravo charlie delta echo foxtrot golf hotel "
        "india juliet kilo lima mike november oscar papa "
        "quebec romeo sierra tango uniform whiskey yankee zulu"
    ).split()
)
# Strip P25 service suffixes to extract the municipality name from a talkgroup
_TG_SUFFIX_RE = re.compile(
@@ -67,6 +80,35 @@ _TG_SUFFIX_RE = re.compile(
)
def _is_garbage_transcript(transcript: str) -> bool:
    """
    Detect Whisper hallucinations that should be discarded before GPT processing.

    Two signals:
      1. Phonetic-alphabet run ≥ 5 consecutive words: Whisper hallucinated a
         training-data sequence (common on silent or noise-only audio).
      2. High comma density (> 15% of tokens) in long transcripts (more than
         30 tokens): list-dump hallucinations contain far more commas than
         real radio speech.

    Tokens are runs of word characters, optionally joined by single internal
    hyphens so that "x-ray" stays one token. Stand-alone punctuation such as
    a lone "-" is not a token, so "Adam - Baker - Charles ..." still counts
    as one uninterrupted phonetic run and dashes do not dilute the comma ratio.

    Returns False for a None, empty, or punctuation-only transcript: there is
    nothing to classify, so the caller proceeds as it would have otherwise.
    """
    # Guard against a missing transcript instead of raising on .lower().
    if not transcript:
        return False

    words = re.findall(r"\w+(?:-\w+)*", transcript.lower())
    if not words:
        return False

    # Signal 1: length of the current streak of phonetic-alphabet words.
    run = 0
    for word in words:
        if word in _PHONETIC_ALPHA_WORDS:
            run += 1
            if run >= 5:
                return True
        else:
            run = 0

    # Signal 2: comma density, only meaningful once the transcript is long
    # enough that a few ordinary commas cannot trip the ratio.
    return len(words) > 30 and transcript.count(",") / len(words) > 0.15
def _build_ten_codes_block(ten_codes: dict[str, str]) -> str:
if not ten_codes:
return ""
@@ -107,6 +149,13 @@ async def extract_scenes(
vocabulary = system_doc.get("vocabulary") or []
ten_codes = system_doc.get("ten_codes") or {}
if _is_garbage_transcript(transcript):
logger.warning(
f"Intelligence: call {call_id} — garbage transcript detected "
f"(Whisper hallucination), skipping extraction"
)
return []
raw_scenes: list[dict] = await asyncio.to_thread(
_sync_extract,
transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary, ten_codes,
@@ -144,9 +193,14 @@ async def extract_scenes(
# Geocode this scene's location
location_coords: Optional[dict] = None
if location and node_lat is not None and node_lon is not None:
state = await _get_node_state(node_id, node_lat, node_lon)
muni = _municipality_from_tg(talkgroup_name)
hint_parts = [p for p in [muni, state] if p]
place = await _get_node_place(node_id, node_lat, node_lon)
muni = _municipality_from_tg(talkgroup_name)
county = place.get("county", "")
state = place.get("state", "")
# Build hint from most specific to least: municipality → county → state.
# Including county prevents common street names (e.g. "East Main Street")
# from resolving to a wrong county when the address is ambiguous.
hint_parts = [p for p in [muni, county, state] if p]
query = f"{location}, {', '.join(hint_parts)}" if hint_parts else location
location_coords = await _geocode_location(query, node_lat, node_lon)
@@ -252,33 +306,43 @@ async def _geocode_location(
return None
async def _get_node_state(node_id: str, lat: float, lon: float) -> Optional[str]:
async def _get_node_place(node_id: str, lat: float, lon: float) -> dict:
"""
Reverse geocode the node's position once to extract its state.
Reverse geocode the node's position once to extract county + state.
Uses zoom=10 so Nominatim returns county-level granularity, which is
included in geocoding queries to prevent common street names from resolving
to a wrong county (e.g. "East Main Street" in Orange vs. Westchester).
Result is cached for the process lifetime — nodes don't move.
Returns dict with "county" and "state" keys (empty string if not found).
"""
if node_id in _node_state_cache:
return _node_state_cache[node_id]
if node_id in _node_place_cache:
return _node_place_cache[node_id]
import httpx
headers = {"User-Agent": "DRB-Dispatch/1.0 (public-safety radio monitor)"}
place: dict = {"county": "", "state": ""}
try:
async with httpx.AsyncClient(timeout=5.0) as client:
r = await client.get(
"https://nominatim.openstreetmap.org/reverse",
params={"lat": lat, "lon": lon, "format": "json", "zoom": 5},
params={"lat": lat, "lon": lon, "format": "json", "zoom": 10},
headers=headers,
)
r.raise_for_status()
data = r.json()
state = data.get("address", {}).get("state", "")
if state:
_node_state_cache[node_id] = state
logger.info(f"Node {node_id} reverse-geocoded to state: {state!r}")
return state
addr = data.get("address", {})
place["county"] = addr.get("county", "")
place["state"] = addr.get("state", "")
except Exception as e:
logger.warning(f"Node state reverse geocode failed: {e}")
return None
logger.warning(f"Node place reverse geocode failed: {e}")
if place["county"] or place["state"]:
_node_place_cache[node_id] = place
logger.info(
f"Node {node_id} reverse-geocoded: county={place['county']!r}, "
f"state={place['state']!r}"
)
return place
def _municipality_from_tg(tg_name: Optional[str]) -> Optional[str]: