fix: garbage transcript detection, county geocoding, dispatch channel detection
- intelligence.py: detect Whisper phonetic-alphabet hallucinations before sending to GPT; skip extraction entirely to prevent fake units/tags from corrupting correlation
- intelligence.py: upgrade node reverse-geocode from zoom=5 (state) to zoom=10 (county) and include county in address queries so common street names (e.g. "East Main Street") resolve to the correct county
- incident_correlator.py: add "patched" and "primary" to the dispatch channel regex so patched trunking channels are treated as shared backbones
- incident_correlator.py: add a 20-min idle gate for the tactical channel default so a reused frequency can't absorb a new unrelated incident
This commit is contained in:
@@ -54,8 +54,21 @@ Talkgroup: {talkgroup_name}
|
||||
# Nominatim viewbox half-width in degrees (0.5° ≈ 55 km of latitude)
|
||||
_GEO_DELTA = 0.5 # ~55 km bias radius; viewbox used as preference, not hard bound
|
||||
|
||||
# node_id → state abbreviation/name from one-time reverse geocode
|
||||
_node_state_cache: dict[str, str] = {}
|
||||
# node_id → {"county": str, "state": str} from one-time reverse geocode
|
||||
_node_place_cache: dict[str, dict] = {}
|
||||
|
||||
# Police/law-enforcement phonetic alphabet words (APCO + NATO/ICAO), lowercase.
# A run of 5+ of these in a transcript is a strong Whisper hallucination signal.
# Spelling variants are listed explicitly because membership is an exact
# string match against lowercased transcript tokens.
_PHONETIC_ALPHA_WORDS = frozenset({
    # APCO (law enforcement)
    "adam", "baker", "charles", "david", "edward", "frank", "george", "henry",
    "ida", "john", "king", "lincoln", "mary", "nora", "ocean", "paul", "queen",
    "robert", "sam", "tom", "union", "victor", "william", "x-ray", "young", "zebra",
    # NATO (victor / x-ray are shared with the APCO list above)
    "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel",
    "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa",
    "quebec", "romeo", "sierra", "tango", "uniform", "whiskey", "yankee", "zulu",
    # Spelling variants: official ICAO forms and the unhyphenated "xray"
    "alfa", "juliett", "xray",
})
|
||||
|
||||
# Strip P25 service suffixes to extract the municipality name from a talkgroup
|
||||
_TG_SUFFIX_RE = re.compile(
|
||||
@@ -67,6 +80,35 @@ _TG_SUFFIX_RE = re.compile(
|
||||
)
|
||||
|
||||
|
||||
def _is_garbage_transcript(transcript: str) -> bool:
    """
    Detect Whisper hallucinations that should be discarded before GPT processing.

    Two signals:
      1. Phonetic-alphabet run ≥ 5 consecutive words: Whisper hallucinated a
         training-data sequence (common on silent or noise-only audio).
      2. High comma density (> 15% of tokens) in long transcripts (> 30
         tokens): list-dump hallucinations contain far more commas than real
         radio speech.

    Args:
        transcript: Raw transcript text. ``None`` or an empty string is
            treated as "not garbage" (there is nothing to classify).

    Returns:
        True if either signal fires, otherwise False.
    """
    if not transcript:
        return False

    # Tokens are word characters with optional *internal* hyphens ("x-ray").
    # Free-standing dashes are punctuation, not words: counting them would
    # reset the phonetic run and dilute the comma-density ratio.
    words = re.findall(r"\w+(?:-\w+)*", transcript.lower())
    if not words:
        return False

    # Signal 1: longest streak of consecutive phonetic-alphabet words.
    run = 0
    for w in words:
        if w in _PHONETIC_ALPHA_WORDS:
            run += 1
            if run >= 5:
                return True
        else:
            run = 0

    # Signal 2: comma density, only meaningful on long transcripts.
    return len(words) > 30 and transcript.count(",") / len(words) > 0.15
|
||||
|
||||
|
||||
def _build_ten_codes_block(ten_codes: dict[str, str]) -> str:
|
||||
if not ten_codes:
|
||||
return ""
|
||||
@@ -107,6 +149,13 @@ async def extract_scenes(
|
||||
vocabulary = system_doc.get("vocabulary") or []
|
||||
ten_codes = system_doc.get("ten_codes") or {}
|
||||
|
||||
if _is_garbage_transcript(transcript):
|
||||
logger.warning(
|
||||
f"Intelligence: call {call_id} — garbage transcript detected "
|
||||
f"(Whisper hallucination), skipping extraction"
|
||||
)
|
||||
return []
|
||||
|
||||
raw_scenes: list[dict] = await asyncio.to_thread(
|
||||
_sync_extract,
|
||||
transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary, ten_codes,
|
||||
@@ -144,9 +193,14 @@ async def extract_scenes(
|
||||
# Geocode this scene's location
|
||||
location_coords: Optional[dict] = None
|
||||
if location and node_lat is not None and node_lon is not None:
|
||||
state = await _get_node_state(node_id, node_lat, node_lon)
|
||||
muni = _municipality_from_tg(talkgroup_name)
|
||||
hint_parts = [p for p in [muni, state] if p]
|
||||
place = await _get_node_place(node_id, node_lat, node_lon)
|
||||
muni = _municipality_from_tg(talkgroup_name)
|
||||
county = place.get("county", "")
|
||||
state = place.get("state", "")
|
||||
# Build hint from most specific to least: municipality → county → state.
|
||||
# Including county prevents common street names (e.g. "East Main Street")
|
||||
# from resolving to a wrong county when the address is ambiguous.
|
||||
hint_parts = [p for p in [muni, county, state] if p]
|
||||
query = f"{location}, {', '.join(hint_parts)}" if hint_parts else location
|
||||
location_coords = await _geocode_location(query, node_lat, node_lon)
|
||||
|
||||
@@ -252,33 +306,43 @@ async def _geocode_location(
|
||||
return None
|
||||
|
||||
|
||||
async def _get_node_place(node_id: str, lat: float, lon: float) -> dict:
    """
    Reverse geocode the node's position once to extract county + state.

    Uses zoom=10 so Nominatim returns county-level granularity, which is
    included in geocoding queries to prevent common street names from resolving
    to a wrong county (e.g. "East Main Street" in Orange vs. Westchester).

    A result is stored in ``_node_place_cache`` only when at least one of
    county/state was found, so a failed lookup is retried on the next call.

    Args:
        node_id: Cache key identifying the node.
        lat: Node latitude passed to the reverse-geocode request.
        lon: Node longitude passed to the reverse-geocode request.

    Returns:
        Dict with "county" and "state" keys (empty string if not found).
        Never raises: request/parse errors are logged and yield empty values.
    """
    if node_id in _node_place_cache:
        return _node_place_cache[node_id]

    import httpx

    headers = {"User-Agent": "DRB-Dispatch/1.0 (public-safety radio monitor)"}
    place: dict = {"county": "", "state": ""}
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            r = await client.get(
                "https://nominatim.openstreetmap.org/reverse",
                params={"lat": lat, "lon": lon, "format": "json", "zoom": 10},
                headers=headers,
            )
            r.raise_for_status()
            data = r.json()
            # `or` guards against a present-but-null "address"/field so the
            # documented "empty string if not found" contract always holds.
            addr = data.get("address") or {}
            place["county"] = addr.get("county") or ""
            place["state"] = addr.get("state") or ""
    except Exception as e:
        # Best-effort lookup: geocoding hints are optional, so log and fall
        # through with empty values rather than failing the caller.
        logger.warning(f"Node place reverse geocode failed: {e}")

    if place["county"] or place["state"]:
        _node_place_cache[node_id] = place
        logger.info(
            f"Node {node_id} reverse-geocoded: county={place['county']!r}, "
            f"state={place['state']!r}"
        )
    return place


async def _get_node_state(node_id: str, lat: float, lon: float) -> Optional[str]:
    """
    Backward-compatible wrapper returning only the node's state.

    Delegates to ``_get_node_place`` so both entry points share one cache and
    one reverse-geocode request. Returns None when no state was found.
    """
    place = await _get_node_place(node_id, lat, lon)
    return place.get("state") or None
|
||||
|
||||
|
||||
def _municipality_from_tg(tg_name: Optional[str]) -> Optional[str]:
|
||||
|
||||
Reference in New Issue
Block a user