Compare commits

...

3 Commits

Author SHA1 Message Date
Logan 97f4286810 Add debugging 2026-05-04 01:46:56 -04:00
Logan e704df1a62 # app/internal/incident_correlator.py
- *`correlate_call`* — added units and vehicles optional params; when provided (per-scene from intelligence extraction), they take priority over the merged call-document values, preventing multi-scene unit contamination
- *Cross-TGID correlation path (2.5)* — *new path between location and slow paths*: when a call shares 2+ unit IDs with a recent same-system, same-type incident AND embedding similarity ≥ 0.85, it links them — catches multi-talkgroup pursuits like the bicycle search that split across dispatch/tactical/geographic channels
# `app/internal/intelligence.py`
- *`reassignment` field* — added to the GPT-4o-mini prompt schema and rules; `true` when dispatch is actively pulling a unit to a new, different call (not a status update or en route acknowledgement); returned in every processed scene dict
- *Tag location rule* — added explicit instruction to the prompt: tags must describe what happened, not where; place names, road names, and talkgroup names are explicitly forbidden as tags
# `app/routers/upload.py`
- Both scene correlation call sites (`_run_extraction_pipeline` and `_run_intelligence_pipeline`) now pass `units=corr_units` where `corr_units = [] if scene.get("reassignment") else scene.get("units")` — suppresses unit overlap matching when a unit is being reassigned to a new call, preventing chaining into their previous incident
- Both sites also pass `vehicles=scene.get("vehicles")` (per-scene vehicles, from the multi-scene units fix)
# `app/config.py`
- `embedding_cross_tg_threshold: float = 0.85` — threshold for the new cross-TGID path
2026-05-04 01:33:03 -04:00
Logan f6897566f8 Fix tags, titles, and hallucinations 2026-05-04 01:13:18 -04:00
7 changed files with 145 additions and 10 deletions
+1
View File
@@ -26,6 +26,7 @@ class Settings(BaseSettings):
correlation_window_hours: int = 2 # slow/location path: max hours since last call correlation_window_hours: int = 2 # slow/location path: max hours since last call
embedding_similarity_threshold: float = 0.93 # slow-path: requires location corroboration embedding_similarity_threshold: float = 0.93 # slow-path: requires location corroboration
embedding_no_location_threshold: float = 0.97 # slow-path: match without location (very high bar) embedding_no_location_threshold: float = 0.97 # slow-path: match without location (very high bar)
embedding_cross_tg_threshold: float = 0.85 # cross-TG path: same dept + 2+ shared units
location_proximity_km: float = 0.5 # radius for location-proximity matching location_proximity_km: float = 0.5 # radius for location-proximity matching
incident_auto_resolve_minutes: int = 90 # auto-resolve after N minutes with no new calls incident_auto_resolve_minutes: int = 90 # auto-resolve after N minutes with no new calls
recorrelation_scan_minutes: int = 60 # re-examine orphaned calls ended within this window recorrelation_scan_minutes: int = 60 # re-examine orphaned calls ended within this window
@@ -49,6 +49,14 @@ from app.config import settings
_DISPATCH_TG_RE = re.compile(r"\bdispatch\b|\bdisp\b", re.IGNORECASE) _DISPATCH_TG_RE = re.compile(r"\bdispatch\b|\bdisp\b", re.IGNORECASE)
def _tag_to_title(tag: str) -> str:
"""
Convert a hyphenated tag to title case without the str.title() apostrophe bug.
e.g. "lower-macy's""Lower Macy's" (not "Lower Macy'S")
"""
return " ".join(w.capitalize() for w in tag.replace("-", " ").split())
def _is_dispatch_channel(talkgroup_name: Optional[str]) -> bool: def _is_dispatch_channel(talkgroup_name: Optional[str]) -> bool:
"""True when the talkgroup is a shared dispatch backbone (not a tactical/working channel).""" """True when the talkgroup is a shared dispatch backbone (not a tactical/working channel)."""
if not talkgroup_name: if not talkgroup_name:
@@ -84,6 +92,8 @@ async def correlate_call(
location_coords: Optional[dict] = None, location_coords: Optional[dict] = None,
reference_time: Optional[datetime] = None, reference_time: Optional[datetime] = None,
create_if_new: bool = True, create_if_new: bool = True,
units: Optional[list[str]] = None,
vehicles: Optional[list[str]] = None,
) -> Optional[str]: ) -> Optional[str]:
""" """
Link call_id to an existing incident or create a new one. Link call_id to an existing incident or create a new one.
@@ -107,13 +117,18 @@ async def correlate_call(
# Fetch call doc once — reused for disambiguation, embedding merge, unit accumulation # Fetch call doc once — reused for disambiguation, embedding merge, unit accumulation
call_doc = await fstore.doc_get("calls", call_id) or {} call_doc = await fstore.doc_get("calls", call_id) or {}
call_embedding: Optional[list] = call_doc.get("embedding") call_embedding: Optional[list] = call_doc.get("embedding")
call_units: list[str] = call_doc.get("units") or [] # Prefer explicitly passed units/vehicles (per-scene, from intelligence extraction)
call_vehicles: list[str] = call_doc.get("vehicles") or [] # over the call doc, which merges units from ALL scenes in a multi-scene recording.
# Falling back to the call doc is correct for recorrelation sweeps where we have no
# scene-level breakdown.
call_units: list[str] = units if units is not None else (call_doc.get("units") or [])
call_vehicles: list[str] = vehicles if vehicles is not None else (call_doc.get("vehicles") or [])
call_severity: str = call_doc.get("severity") or "unknown" call_severity: str = call_doc.get("severity") or "unknown"
# Use passed coords first (freshly geocoded), fall back to what's on the call doc # Use passed coords first (freshly geocoded), fall back to what's on the call doc
coords: Optional[dict] = location_coords or call_doc.get("location_coords") coords: Optional[dict] = location_coords or call_doc.get("location_coords")
matched_incident: Optional[dict] = None matched_incident: Optional[dict] = None
corr_debug: dict = {}
# A "thin" call carries no scene-identifying information — it is a pure # A "thin" call carries no scene-identifying information — it is a pure
# status transmission (10-4, en route, acknowledgement). Detected by the # status transmission (10-4, en route, acknowledgement). Detected by the
@@ -156,6 +171,10 @@ async def correlate_call(
# Status/ack call — no scene data to reason about. # Status/ack call — no scene data to reason about.
# Attach to whichever recent incident was most recently active on this TGID. # Attach to whichever recent incident was most recently active on this TGID.
matched_incident = max(tg_recent, key=lambda inc: inc.get("updated_at", "")) matched_incident = max(tg_recent, key=lambda inc: inc.get("updated_at", ""))
corr_debug = {
"corr_path": "fast/thin",
"corr_incident_idle_min": round(_incident_idle_minutes(matched_incident, now), 1),
}
logger.info( logger.info(
f"Correlator fast-path (thin→last TGID incident): " f"Correlator fast-path (thin→last TGID incident): "
f"call {call_id}{matched_incident['incident_id']}" f"call {call_id}{matched_incident['incident_id']}"
@@ -167,6 +186,10 @@ async def correlate_call(
settings.location_proximity_km, is_dispatch=is_dispatch, settings.location_proximity_km, is_dispatch=is_dispatch,
): ):
matched_incident = candidate matched_incident = candidate
corr_debug = {
"corr_path": "fast/single",
"corr_incident_idle_min": round(_incident_idle_minutes(candidate, now), 1),
}
logger.info( logger.info(
f"Correlator fast-path: call {call_id}{candidate['incident_id']}" f"Correlator fast-path: call {call_id}{candidate['incident_id']}"
) )
@@ -179,6 +202,11 @@ async def correlate_call(
matched_incident = _disambiguate( matched_incident = _disambiguate(
tg_recent, call_units, call_vehicles, coords, call_embedding tg_recent, call_units, call_vehicles, coords, call_embedding
) )
corr_debug = {
"corr_path": "fast/disambig",
"corr_incident_idle_min": round(_incident_idle_minutes(matched_incident, now), 1),
"corr_candidates": len(tg_recent),
}
logger.info( logger.info(
f"Correlator fast-path (disambig {len(tg_recent)} candidates): " f"Correlator fast-path (disambig {len(tg_recent)} candidates): "
f"call {call_id}{matched_incident['incident_id']}" f"call {call_id}{matched_incident['incident_id']}"
@@ -196,12 +224,54 @@ async def correlate_call(
) )
if dist_km <= settings.location_proximity_km: if dist_km <= settings.location_proximity_km:
matched_incident = inc matched_incident = inc
corr_debug = {"corr_path": "location", "corr_distance_km": round(dist_km, 3)}
logger.info( logger.info(
f"Correlator location-path: call {call_id}{inc['incident_id']} " f"Correlator location-path: call {call_id}{inc['incident_id']} "
f"(dist={dist_km:.2f}km)" f"(dist={dist_km:.2f}km)"
) )
break break
# ── 2.5. Cross-TG path: same department, overlapping units, moderate similarity ──
#
# Catches pursuits / searches that span multiple talkgroup IDs within the same
# department (e.g. dispatch → tactical → geographic channel). The fast path
# is TGID-scoped so it never links these. Two conditions together provide
# strong evidence of the same scene without needing location:
# • 2+ shared unit IDs (same officers working the same call)
# • embedding similarity >= cross-TG threshold (same subject matter)
# Requiring 2+ shared units prevents single-officer false positives.
if not matched_incident and call_embedding and incident_type and call_units and system_id:
call_unit_set = set(call_units)
best_cross_score = 0.0
best_cross_inc: Optional[dict] = None
for inc in recent:
if inc.get("type") != incident_type:
continue
if system_id not in (inc.get("system_ids") or []):
continue
inc_units_set = set(inc.get("units") or [])
if len(call_unit_set & inc_units_set) < 2:
continue
inc_embedding = inc.get("embedding")
if not inc_embedding:
continue
sim = _cosine_similarity(call_embedding, inc_embedding)
if sim > best_cross_score:
best_cross_score = sim
best_cross_inc = inc
if best_cross_inc and best_cross_score >= settings.embedding_cross_tg_threshold:
matched_incident = best_cross_inc
shared = len(call_unit_set & set(best_cross_inc.get("units") or []))
corr_debug = {
"corr_path": "cross-tg",
"corr_score": round(best_cross_score, 4),
"corr_shared_units": shared,
}
logger.info(
f"Correlator cross-TG path: call {call_id}{best_cross_inc['incident_id']} "
f"(sim={best_cross_score:.3f}, shared_units={shared})"
)
# ── 3. Slow path: embedding similarity (time-limited, same type) ────────── # ── 3. Slow path: embedding similarity (time-limited, same type) ──────────
# #
# Two tiers: # Two tiers:
@@ -232,6 +302,11 @@ async def correlate_call(
) )
if dist_km <= settings.location_proximity_km * 4: if dist_km <= settings.location_proximity_km * 4:
matched_incident = best_inc matched_incident = best_inc
corr_debug = {
"corr_path": "slow",
"corr_score": round(best_score, 4),
"corr_distance_km": round(dist_km, 3),
}
logger.info( logger.info(
f"Correlator slow-path: call {call_id}{best_inc['incident_id']} " f"Correlator slow-path: call {call_id}{best_inc['incident_id']} "
f"(sim={best_score:.3f}, dist={dist_km:.2f}km)" f"(sim={best_score:.3f}, dist={dist_km:.2f}km)"
@@ -240,6 +315,10 @@ async def correlate_call(
# High-confidence semantic match; geocode unavailable on one or # High-confidence semantic match; geocode unavailable on one or
# both sides — content similarity alone is sufficient evidence. # both sides — content similarity alone is sufficient evidence.
matched_incident = best_inc matched_incident = best_inc
corr_debug = {
"corr_path": "slow/no-location",
"corr_score": round(best_score, 4),
}
logger.info( logger.info(
f"Correlator slow-path (high-confidence, no location): " f"Correlator slow-path (high-confidence, no location): "
f"call {call_id}{best_inc['incident_id']} (sim={best_score:.3f})" f"call {call_id}{best_inc['incident_id']} (sim={best_score:.3f})"
@@ -259,10 +338,19 @@ async def correlate_call(
tags, location, location_coords, tags, location, location_coords,
call_units, call_vehicles, call_embedding, call_severity, now, call_units, call_vehicles, call_embedding, call_severity, now,
) )
corr_debug["corr_path"] = "new"
else: else:
# No match and either no type or creation suppressed — nothing to do # No match and either no type or creation suppressed — nothing to do
return None return None
# Persist the correlation decision to the call document so it can be
# inspected in Firestore or the admin UI without log-scraping.
if corr_debug:
try:
await fstore.doc_set("calls", call_id, corr_debug)
except Exception as e:
logger.warning(f"Could not write corr_debug for call {call_id}: {e}")
return incident_id return incident_id
@@ -474,12 +562,12 @@ async def _update_incident(
# Routine status calls (type=None) do not clobber the title. # Routine status calls (type=None) do not clobber the title.
if incident_type: if incident_type:
content_tags = [t for t in tags if t != "auto-generated"] content_tags = [t for t in tags if t != "auto-generated"]
primary_tag = content_tags[0].replace("-", " ").title() if content_tags else None primary_tag = _tag_to_title(content_tags[0]) if content_tags else None
tg_label = ( tg_label = (
talkgroup_name talkgroup_name
or (f"TGID {talkgroup_id}" if talkgroup_id else inc.get("title", "").split("")[-1]) or (f"TGID {talkgroup_id}" if talkgroup_id else inc.get("title", "").split("")[-1])
) )
if primary_tag and best_location: if primary_tag and best_location and primary_tag.lower() != best_location.lower():
updates["title"] = f"{primary_tag} at {best_location}" updates["title"] = f"{primary_tag} at {best_location}"
elif primary_tag and tg_label: elif primary_tag and tg_label:
updates["title"] = f"{primary_tag}{tg_label}" updates["title"] = f"{primary_tag}{tg_label}"
@@ -513,13 +601,13 @@ async def _create_incident(
# Build a descriptive title from tags + location when available # Build a descriptive title from tags + location when available
content_tags = [t for t in tags if t != "auto-generated"] content_tags = [t for t in tags if t != "auto-generated"]
primary_tag = content_tags[0].replace("-", " ").title() if content_tags else None primary_tag = _tag_to_title(content_tags[0]) if content_tags else None
if primary_tag and location: if primary_tag and location and primary_tag.lower() != location.lower():
title = f"{primary_tag} at {location}" title = f"{primary_tag} at {location}"
elif primary_tag: elif primary_tag:
title = f"{primary_tag}{tg_label}" title = f"{primary_tag}{tg_label}"
else: else:
title = f"{incident_type.title()}{tg_label}" title = f"{_tag_to_title(incident_type)}{tg_label}"
doc = { doc = {
"incident_id": incident_id, "incident_id": incident_id,
+5 -1
View File
@@ -31,16 +31,18 @@ Response format — a JSON object with a "scenes" array. Each scene:
units: list of unit IDs or officer numbers explicitly mentioned units: list of unit IDs or officer numbers explicitly mentioned
severity: one of "minor" | "moderate" | "major" | "unknown" severity: one of "minor" | "moderate" | "major" | "unknown"
resolved: true if this scene explicitly signals incident closure, false otherwise resolved: true if this scene explicitly signals incident closure, false otherwise
reassignment: true if dispatch is actively pulling a unit away from their current assignment to respond to a new, different call — e.g. "Baker, can you clear and respond to...", "Adam, break from that and go to...". False if the unit is simply reporting in, updating status, or continuing their current assignment.
transcript_corrected: corrected text for this scene's transmissions only, or null transcript_corrected: corrected text for this scene's transmissions only, or null
Rules: Rules:
- location: prefer intersections > addresses > mile markers > route+town > route alone > town alone. Empty string if none. - location: prefer intersections > addresses > mile markers > route+town > route alone > town alone. Empty string if none.
- tags: specific, lowercase, hyphenated. Do not repeat incident_type as a tag. - tags: describe WHAT happened, not WHERE. Specific, lowercase, hyphenated. Do not use location names, road names, talkgroup names, or place names as tags (wrong: "lower-macy's", "canvas-route-6", "route-202"; right: "suspect-search", "shoplifting", "vehicle-pursuit"). Do not repeat incident_type as a tag.
- units: only identifiers explicitly mentioned, not inferred. - units: only identifiers explicitly mentioned, not inferred.
- Do not invent details not present in the transcript. - Do not invent details not present in the transcript.
- incident_type: let the talkgroup channel be your primary signal. Use "fire" ONLY if the talkgroup is clearly a fire/rescue channel OR the transcript explicitly describes active fire, smoke, flames, or structure fire activation. Police or EMS referencing a fire scene → use "police" or "ems". When uncertain, prefer "other" over "fire". - incident_type: let the talkgroup channel be your primary signal. Use "fire" ONLY if the talkgroup is clearly a fire/rescue channel OR the transcript explicitly describes active fire, smoke, flames, or structure fire activation. Police or EMS referencing a fire scene → use "police" or "ems". When uncertain, prefer "other" over "fire".
- ten_codes: interpret radio codes using the department reference provided below. Do not guess codes not listed. - ten_codes: interpret radio codes using the department reference provided below. Do not guess codes not listed.
- resolved: true only when the scene explicitly signals "Code 4", "all clear", "10-42", "in custody", "patient transported", "fire out", "GOA", "negative contact", "scene clear". - resolved: true only when the scene explicitly signals "Code 4", "all clear", "10-42", "in custody", "patient transported", "fire out", "GOA", "negative contact", "scene clear".
- reassignment: only true when a unit is explicitly being pulled to a completely new call or location. A unit going en route to their first dispatch is NOT a reassignment. Routine status updates, acknowledgements, and scene updates are NOT reassignments.
- transcript_corrected: fix only clear STT/vocoder errors (e.g. "Several""10-4", misheard street names, garbled unit IDs). Keep all radio language as-is — do NOT decode codes into plain English. Return null if accurate. - transcript_corrected: fix only clear STT/vocoder errors (e.g. "Several""10-4", misheard street names, garbled unit IDs). Keep all radio language as-is — do NOT decode codes into plain English. Return null if accurate.
System: {system_id} System: {system_id}
@@ -130,6 +132,7 @@ async def extract_scenes(
units: list[str] = scene.get("units") or [] units: list[str] = scene.get("units") or []
severity: str = scene.get("severity") or "unknown" severity: str = scene.get("severity") or "unknown"
resolved: bool = bool(scene.get("resolved", False)) resolved: bool = bool(scene.get("resolved", False))
reassignment: bool = bool(scene.get("reassignment", False))
transcript_corrected: Optional[str]= scene.get("transcript_corrected") or None transcript_corrected: Optional[str]= scene.get("transcript_corrected") or None
segment_indices: Optional[list] = scene.get("segment_indices") segment_indices: Optional[list] = scene.get("segment_indices")
@@ -160,6 +163,7 @@ async def extract_scenes(
"units": units, "units": units,
"severity": severity, "severity": severity,
"resolved": resolved, "resolved": resolved,
"reassignment": reassignment,
"transcript_corrected": transcript_corrected, "transcript_corrected": transcript_corrected,
"segment_indices": segment_indices, "segment_indices": segment_indices,
"embedding": embedding, "embedding": embedding,
+6 -2
View File
@@ -123,8 +123,6 @@ def _sync_transcribe(
response_format="verbose_json", response_format="verbose_json",
temperature=0, temperature=0,
) )
text = response.text.strip() or None
# Filter hallucinated segments. Two sources of hallucination in P25 recordings: # Filter hallucinated segments. Two sources of hallucination in P25 recordings:
# #
# 1. Trailing silence / static — Whisper fills silence past real content with # 1. Trailing silence / static — Whisper fills silence past real content with
@@ -142,6 +140,12 @@ def _sync_transcribe(
and s.start < audio_duration and s.start < audio_duration
and getattr(s, "no_speech_prob", 0.0) < 0.8 and getattr(s, "no_speech_prob", 0.0) < 0.8
] ]
# Reconstruct text from non-hallucinated segments only so the two stay
# in sync. If every segment was filtered (e.g. pure static or repeated
# prompt-word hallucination like "Standby. Standby. Standby..."), text
# becomes None which prevents the intelligence pipeline from running on
# hallucinated content.
text = " ".join(s["text"] for s in segments) or None
return text, segments return text, segments
finally: finally:
try: try:
+8
View File
@@ -110,6 +110,9 @@ async def _run_extraction_pipeline(
all_tags: list[str] = [] all_tags: list[str] = []
for scene in scenes: for scene in scenes:
all_tags.extend(scene["tags"]) all_tags.extend(scene["tags"])
# When dispatch is pulling a unit to a NEW call (reassignment), suppress unit
# overlap so the new scene doesn't chain into the unit's previous incident.
corr_units = [] if scene.get("reassignment") else scene.get("units")
incident_id = await incident_correlator.correlate_call( incident_id = await incident_correlator.correlate_call(
call_id=call_id, call_id=call_id,
node_id=node_id, node_id=node_id,
@@ -120,6 +123,8 @@ async def _run_extraction_pipeline(
incident_type=scene["incident_type"], incident_type=scene["incident_type"],
location=scene["location"], location=scene["location"],
location_coords=scene["location_coords"], location_coords=scene["location_coords"],
units=corr_units,
vehicles=scene.get("vehicles"),
) )
if incident_id and incident_id not in incident_ids: if incident_id and incident_id not in incident_ids:
incident_ids.append(incident_id) incident_ids.append(incident_id)
@@ -206,6 +211,7 @@ async def _run_intelligence_pipeline(
if flags["correlation_enabled"]: if flags["correlation_enabled"]:
for scene in scenes: for scene in scenes:
all_tags.extend(scene["tags"]) all_tags.extend(scene["tags"])
corr_units = [] if scene.get("reassignment") else scene.get("units")
incident_id = await incident_correlator.correlate_call( incident_id = await incident_correlator.correlate_call(
call_id=call_id, call_id=call_id,
node_id=node_id, node_id=node_id,
@@ -216,6 +222,8 @@ async def _run_intelligence_pipeline(
incident_type=scene["incident_type"], incident_type=scene["incident_type"],
location=scene["location"], location=scene["location"],
location_coords=scene["location_coords"], location_coords=scene["location_coords"],
units=corr_units,
vehicles=scene.get("vehicles"),
) )
if incident_id and incident_id not in incident_ids: if incident_id and incident_id not in incident_ids:
incident_ids.append(incident_id) incident_ids.append(incident_id)
+23
View File
@@ -138,6 +138,29 @@ export function CallRow({ call, systemName, isAdmin }: Props) {
</div> </div>
)} )}
{/* Correlation debug — admin only */}
{isAdmin && call.corr_path && (
<div className="flex flex-wrap gap-x-3 gap-y-0.5 text-xs font-mono text-gray-600">
<span>corr:</span>
<span className="text-gray-400">{call.corr_path}</span>
{call.corr_incident_idle_min != null && (
<span>idle {call.corr_incident_idle_min}min</span>
)}
{call.corr_score != null && (
<span>sim={call.corr_score.toFixed(3)}</span>
)}
{call.corr_distance_km != null && (
<span>dist={call.corr_distance_km}km</span>
)}
{call.corr_shared_units != null && (
<span>{call.corr_shared_units} shared units</span>
)}
{call.corr_candidates != null && (
<span>{call.corr_candidates} candidates</span>
)}
</div>
)}
{/* Transcript */} {/* Transcript */}
{editing ? ( {editing ? (
<div className="space-y-2" onClick={(e) => e.stopPropagation()}> <div className="space-y-2" onClick={(e) => e.stopPropagation()}>
+7
View File
@@ -56,6 +56,13 @@ export interface CallRecord {
location: string | null; location: string | null;
tags: string[]; tags: string[];
status: "active" | "ended"; status: "active" | "ended";
// Correlation debug — written by the correlator, present after a call is linked
corr_path?: string | null;
corr_score?: number | null;
corr_distance_km?: number | null;
corr_incident_idle_min?: number | null;
corr_shared_units?: number | null;
corr_candidates?: number | null;
} }
export interface IncidentRecord { export interface IncidentRecord {