Fix correlation over-merge, thin-call hallucination, and geocoding accuracy
- Cap unit-continuity path at 20 min idle (unit_continuity_max_idle_minutes)
- Block time_fallback and unit-continuity matching on reassignment calls
- Expand reassignment detection to cover unit-initiated self-reassignment
- Skip GPT extraction entirely for transcripts ≤5 words (prevents hallucinated tags/units)
- Reduce geocode_max_km from 75 to 40 to reject far-out-of-area results
- Include county in geocoding query for tighter jurisdiction anchoring
This commit is contained in:
@@ -33,13 +33,13 @@ Response format — a JSON object with a "scenes" array. Each scene:
|
||||
cleared_units: list of unit IDs that explicitly signal back-in-service or available in this recording
|
||||
severity: one of "minor" | "moderate" | "major" | "unknown"
|
||||
resolved: true if this scene explicitly signals incident closure, false otherwise
|
||||
reassignment: true if dispatch is actively pulling a unit away from their current assignment to respond to a new, different call — e.g. "Baker, can you clear and respond to...", "Adam, break from that and go to...". False if the unit is simply reporting in, updating status, or continuing their current assignment.
|
||||
reassignment: true if a unit is breaking from their current scene to respond to a completely different call — whether dispatch-initiated ("Baker, can you clear and respond to...", "Adam, break from that and go to...") OR unit-initiated ("Show me headed to the vehicle complaint", "Can you show me to that call", a unit going 10-8 and self-requesting a new assignment). False if the unit is reporting in on their current scene, giving a status update, or requesting information about their existing call.
|
||||
transcript_corrected: corrected text for this scene's transmissions only, or null
|
||||
|
||||
Rules:
|
||||
- location: prefer intersections > addresses > mile markers > route+town > route alone > town alone. Empty string if none.
|
||||
- tags: describe WHAT happened, not WHERE. Specific, lowercase, hyphenated. Do not use location names, road names, talkgroup names, or place names as tags (wrong: "lower-macy's", "canvas-route-6", "route-202"; right: "suspect-search", "shoplifting", "vehicle-pursuit"). Do not repeat incident_type as a tag.
|
||||
- units: only identifiers explicitly mentioned, not inferred.
|
||||
- units: ONLY identifiers that appear verbatim in the transcript. If the word or number is not literally present in the text above, do not include it. Never infer or guess unit IDs.
|
||||
- Do not invent details not present in the transcript.
|
||||
- incident_type: let the talkgroup channel be your primary signal. Use "fire" ONLY if the talkgroup is clearly a fire/rescue channel OR the transcript explicitly describes active fire, smoke, flames, or structure fire activation. Police or EMS referencing a fire scene → use "police" or "ems". When uncertain, prefer "other" over "fire".
|
||||
- ten_codes: interpret radio codes using the department reference provided below. Do not guess codes not listed.
|
||||
@@ -55,8 +55,9 @@ Talkgroup: {talkgroup_name}
|
||||
# Geographic bias radius for geocoding — half-width in degrees (~55 km)
|
||||
_GEO_DELTA = 0.5
|
||||
|
||||
# Cache node state (e.g. "New York") so we only reverse-geocode once per node
|
||||
_node_state_cache: dict[str, str] = {}
|
||||
# Cache node state (e.g. "New York") and county (e.g. "Westchester County") per node
|
||||
_node_state_cache: dict[str, str] = {}
|
||||
_node_county_cache: dict[str, str] = {}
|
||||
|
||||
# Police/law-enforcement phonetic alphabet words (APCO + NATO).
|
||||
# A run of 5+ of these in a transcript is a strong Whisper hallucination signal.
|
||||
@@ -163,6 +164,15 @@ async def extract_scenes(
|
||||
pass
|
||||
return []
|
||||
|
||||
# Transcripts with ≤5 words carry no extractable intelligence — GPT hallucinates
|
||||
# units and tags from thin context (e.g. "Main Lot", "10-4", "David").
|
||||
if len(transcript.split()) <= 5:
|
||||
logger.info(
|
||||
f"Intelligence: call {call_id} — transcript too short for extraction "
|
||||
f"({len(transcript.split())} words), skipping"
|
||||
)
|
||||
return []
|
||||
|
||||
raw_scenes: list[dict] = await asyncio.to_thread(
|
||||
_sync_extract,
|
||||
transcript, talkgroup_name, talkgroup_id, system_id, segments, vocabulary, ten_codes,
|
||||
@@ -203,11 +213,14 @@ async def extract_scenes(
|
||||
# This prevents generic street names from resolving to wrong-country results.
|
||||
location_coords: Optional[dict] = None
|
||||
if location and node_lat is not None and node_lon is not None:
|
||||
muni = _municipality_from_tg(talkgroup_name)
|
||||
state = await _get_node_state(node_id or "", node_lat, node_lon) if node_id else ""
|
||||
parts = [location]
|
||||
muni = _municipality_from_tg(talkgroup_name)
|
||||
state = await _get_node_state(node_id or "", node_lat, node_lon) if node_id else ""
|
||||
county = _node_county_cache.get(node_id or "") if node_id else ""
|
||||
parts = [location]
|
||||
if muni:
|
||||
parts.append(muni)
|
||||
if county:
|
||||
parts.append(county)
|
||||
if state:
|
||||
parts.append(state)
|
||||
query = ", ".join(parts)
|
||||
@@ -288,6 +301,7 @@ def _geo_dist_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
|
||||
async def _get_node_state(node_id: str, lat: float, lon: float) -> str:
|
||||
"""
|
||||
Return the US state name (e.g. "New York") for a node's position.
|
||||
Also populates _node_county_cache as a side-effect (same API call).
|
||||
Uses Google Maps Reverse Geocoding; cached for the process lifetime since nodes don't move.
|
||||
"""
|
||||
if node_id in _node_state_cache:
|
||||
@@ -300,29 +314,36 @@ async def _get_node_state(node_id: str, lat: float, lon: float) -> str:
|
||||
return ""
|
||||
|
||||
state = ""
|
||||
county = ""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
r = await client.get(
|
||||
"https://maps.googleapis.com/maps/api/geocode/json",
|
||||
params={
|
||||
"latlng": f"{lat},{lon}",
|
||||
"result_type": "administrative_area_level_1",
|
||||
"key": settings.google_maps_api_key,
|
||||
"latlng": f"{lat},{lon}",
|
||||
"result_type": "administrative_area_level_1|administrative_area_level_2",
|
||||
"key": settings.google_maps_api_key,
|
||||
},
|
||||
)
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
if data.get("status") == "OK" and data.get("results"):
|
||||
for comp in data["results"][0].get("address_components", []):
|
||||
if "administrative_area_level_1" in comp.get("types", []):
|
||||
state = comp.get("long_name", "")
|
||||
break
|
||||
for result in data["results"]:
|
||||
for comp in result.get("address_components", []):
|
||||
types = comp.get("types", [])
|
||||
if "administrative_area_level_1" in types and not state:
|
||||
state = comp.get("long_name", "")
|
||||
if "administrative_area_level_2" in types and not county:
|
||||
county = comp.get("long_name", "")
|
||||
except Exception as e:
|
||||
logger.warning(f"Node state lookup failed for {node_id}: {e}")
|
||||
|
||||
if state:
|
||||
_node_state_cache[node_id] = state
|
||||
logger.info(f"Node {node_id} state resolved: {state!r}")
|
||||
if county:
|
||||
_node_county_cache[node_id] = county
|
||||
if state or county:
|
||||
logger.info(f"Node {node_id} geo resolved: county={county!r} state={state!r}")
|
||||
return state
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user