From f6897566f86c0020e3411012a398eb42890ff50e Mon Sep 17 00:00:00 2001 From: Logan Date: Mon, 4 May 2026 01:13:18 -0400 Subject: [PATCH] Fix tags, titles, and hallucinations --- .../app/internal/incident_correlator.py | 18 +++++++++++++----- drb-c2-core/app/internal/transcription.py | 8 ++++++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/drb-c2-core/app/internal/incident_correlator.py b/drb-c2-core/app/internal/incident_correlator.py index b602c43..8ac0fb0 100644 --- a/drb-c2-core/app/internal/incident_correlator.py +++ b/drb-c2-core/app/internal/incident_correlator.py @@ -49,6 +49,14 @@ from app.config import settings _DISPATCH_TG_RE = re.compile(r"\bdispatch\b|\bdisp\b", re.IGNORECASE) +def _tag_to_title(tag: str) -> str: + """ + Convert a hyphenated tag to title case without the str.title() apostrophe bug. + e.g. "lower-macy's" → "Lower Macy's" (not "Lower Macy'S") + """ + return " ".join(w.capitalize() for w in tag.replace("-", " ").split()) + + def _is_dispatch_channel(talkgroup_name: Optional[str]) -> bool: """True when the talkgroup is a shared dispatch backbone (not a tactical/working channel).""" if not talkgroup_name: @@ -474,12 +482,12 @@ async def _update_incident( # Routine status calls (type=None) do not clobber the title. 
if incident_type: content_tags = [t for t in tags if t != "auto-generated"] - primary_tag = content_tags[0].replace("-", " ").title() if content_tags else None + primary_tag = _tag_to_title(content_tags[0]) if content_tags else None tg_label = ( talkgroup_name or (f"TGID {talkgroup_id}" if talkgroup_id else inc.get("title", "").split(" — ")[-1]) ) - if primary_tag and best_location: + if primary_tag and best_location and primary_tag.lower() != best_location.lower(): updates["title"] = f"{primary_tag} at {best_location}" elif primary_tag and tg_label: updates["title"] = f"{primary_tag} — {tg_label}" @@ -513,13 +521,13 @@ async def _create_incident( # Build a descriptive title from tags + location when available content_tags = [t for t in tags if t != "auto-generated"] - primary_tag = content_tags[0].replace("-", " ").title() if content_tags else None - if primary_tag and location: + primary_tag = _tag_to_title(content_tags[0]) if content_tags else None + if primary_tag and location and primary_tag.lower() != location.lower(): title = f"{primary_tag} at {location}" elif primary_tag: title = f"{primary_tag} — {tg_label}" else: - title = f"{incident_type.title()} — {tg_label}" + title = f"{_tag_to_title(incident_type)} — {tg_label}" doc = { "incident_id": incident_id, diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py index 7a20feb..3dff828 100644 --- a/drb-c2-core/app/internal/transcription.py +++ b/drb-c2-core/app/internal/transcription.py @@ -123,8 +123,6 @@ def _sync_transcribe( response_format="verbose_json", temperature=0, ) - text = response.text.strip() or None - # Filter hallucinated segments. Two sources of hallucination in P25 recordings: # # 1. 
Trailing silence / static — Whisper fills silence past real content with @@ -142,6 +140,12 @@ def _sync_transcribe( and s.start < audio_duration and getattr(s, "no_speech_prob", 0.0) < 0.8 ] + # Reconstruct text from non-hallucinated segments only so text and segments stay + # in sync. If every segment was filtered (e.g. pure static or repeated + # prompt-word hallucination like "Standby. Standby. Standby..."), text + # becomes None, which prevents the intelligence pipeline from running on + # hallucinated content. + text = " ".join(s["text"] for s in segments) or None return text, segments finally: try: