diff --git a/drb-c2-core/app/internal/intelligence.py b/drb-c2-core/app/internal/intelligence.py index 198b2e9..460e291 100644 --- a/drb-c2-core/app/internal/intelligence.py +++ b/drb-c2-core/app/internal/intelligence.py @@ -12,7 +12,7 @@ from typing import Optional from app.internal.logger import logger from app.internal import firestore as fstore -_PROMPT_TEMPLATE = """You are analyzing a P25 public safety radio transcript. The audio was transcribed by Whisper through a digital radio vocoder, which introduces errors. Extract structured information and respond ONLY with a single valid JSON object — no markdown, no explanation. +_PROMPT_TEMPLATE = """You are analyzing a P25 public safety radio recording. The audio was transcribed by Whisper through a digital radio vocoder, which introduces errors. Each numbered transmission is a separate PTT press from a different radio. Extract structured information and respond ONLY with a single valid JSON object — no markdown, no explanation. Schema: {{ @@ -22,7 +22,7 @@ Schema: "vehicles": [vehicle descriptions mentioned, e.g. "Hyundai Tucson", "black sedan"], "units": [unit IDs or officer numbers mentioned, e.g. "Unit 511", "Car 4"], "severity": one of "minor" | "moderate" | "major" | "unknown", - "transcript_corrected": "corrected transcript string, or null if no corrections needed" + "transcript_corrected": "corrected full transcript string, or null if no corrections needed" }} Rules: @@ -30,17 +30,20 @@ Rules: - tags: be specific and lowercase, hyphenated. Do not repeat incident_type as a tag. - units: only identifiers explicitly mentioned, not inferred. - Do not invent details not present in the transcript. -- transcript_corrected: fix only clear STT errors caused by vocoder distortion (e.g. "Several" → "10-4", misheard street names, garbled unit IDs). Keep all radio language as-is — do NOT decode codes into plain English. Return null if the transcript looks accurate. +- transcript_corrected: fix only clear STT errors caused by vocoder distortion (e.g. "Several" → "10-4", misheard street names, garbled unit IDs). Use the back-and-forth context between transmissions to resolve ambiguities. Keep all radio language as-is — do NOT decode codes into plain English. Return null if the transcript looks accurate. +System: {system_id} Talkgroup: {talkgroup_name} -Transcript: -{transcript}""" +{transcript_block}""" async def extract_tags( call_id: str, transcript: str, talkgroup_name: Optional[str] = None, + talkgroup_id: Optional[int] = None, + system_id: Optional[str] = None, + segments: Optional[list[dict]] = None, ) -> tuple[list[str], Optional[str], Optional[str]]: """ Extract incident tags, type, location, and corrected transcript via Gemini. @@ -51,7 +54,7 @@ async def extract_tags( Side-effect: updates calls/{call_id} in Firestore with tags, location, vehicles, units, severity, transcript_corrected; also stores the call embedding. """ - result = await asyncio.to_thread(_sync_extract, transcript, talkgroup_name) + result = await asyncio.to_thread(_sync_extract, transcript, talkgroup_name, talkgroup_id, system_id, segments) tags: list[str] = result.get("tags") or [] incident_type: Optional[str] = result.get("incident_type") or None @@ -95,7 +98,21 @@ async def extract_tags( return tags, incident_type, location -def _sync_extract(transcript: str, talkgroup_name: Optional[str]) -> dict: +def _build_transcript_block(transcript: str, segments: Optional[list[dict]]) -> str: + """Format transcript as numbered transmissions if segments are available.""" + if segments and len(segments) > 1: + lines = [f"{i+1}. [{s['start']}s] {s['text']}" for i, s in enumerate(segments)] + return f"Transmissions ({len(segments)}):\n" + "\n".join(lines) + return f"Transcript:\n{transcript}" + + +def _sync_extract( + transcript: str, + talkgroup_name: Optional[str], + talkgroup_id: Optional[int], + system_id: Optional[str], + segments: Optional[list[dict]], +) -> dict: """Call Gemini Flash and parse the JSON response.""" from app.config import settings import google.generativeai as genai @@ -110,9 +127,11 @@ def _sync_extract(transcript: str, talkgroup_name: Optional[str]) -> dict: generation_config={"response_mime_type": "application/json"}, ) + tg = f"{talkgroup_name} (TGID {talkgroup_id})" if talkgroup_id else (talkgroup_name or "unknown") prompt = _PROMPT_TEMPLATE.format( - transcript=transcript, - talkgroup_name=talkgroup_name or "unknown", + transcript_block=_build_transcript_block(transcript, segments), + talkgroup_name=tg, + system_id=system_id or "unknown", ) try: diff --git a/drb-c2-core/app/internal/transcription.py b/drb-c2-core/app/internal/transcription.py index 37e4bd1..81cadc5 100644 --- a/drb-c2-core/app/internal/transcription.py +++ b/drb-c2-core/app/internal/transcription.py @@ -28,38 +28,40 @@ async def transcribe_call( call_id: str, gcs_uri: str, talkgroup_name: Optional[str] = None, -) -> Optional[str]: +) -> tuple[Optional[str], list[dict]]: """ Transcribe audio at the given GCS URI and store the result in Firestore. - Args: - call_id: Firestore document ID in the 'calls' collection. - gcs_uri: GCS URI of the audio file, e.g. gs://bucket/calls/xyz.mp3 - talkgroup_name: Passed through to the intelligence layer; unused here. - Returns: - The raw Whisper transcript string, or None if transcription failed. + (transcript, segments) — segments is a list of {start, end, text} dicts, + one per detected transmission. Empty list if transcription failed. """ if not gcs_uri or not gcs_uri.startswith("gs://"): - return None + return None, [] try: - transcript = await asyncio.to_thread(_sync_transcribe, gcs_uri) + transcript, segments = await asyncio.to_thread(_sync_transcribe, gcs_uri) except Exception as e: logger.warning(f"Transcription failed for call {call_id}: {e}") - return None + return None, [] if transcript: + updates: dict = {"transcript": transcript} + if segments: + updates["segments"] = segments try: - await fstore.doc_set("calls", call_id, {"transcript": transcript}) - logger.info(f"Transcript saved for call {call_id} ({len(transcript)} chars)") + await fstore.doc_set("calls", call_id, updates) + logger.info( + f"Transcript saved for call {call_id} " + f"({len(transcript)} chars, {len(segments)} segment(s))" + ) except Exception as e: logger.warning(f"Could not save transcript for {call_id}: {e}") - return transcript + return transcript, segments -def _sync_transcribe(gcs_uri: str) -> Optional[str]: +def _sync_transcribe(gcs_uri: str) -> tuple[Optional[str], list[dict]]: """Download audio from GCS and transcribe with OpenAI Whisper.""" from google.cloud import storage as gcs from google.oauth2 import service_account @@ -99,8 +101,15 @@ def _sync_transcribe(gcs_uri: str) -> Optional[str]: file=f, language="en", prompt=_WHISPER_PROMPT, + response_format="verbose_json", ) - return response.text.strip() or None + text = response.text.strip() or None + segments = [ + {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()} + for s in (response.segments or []) + if s.text.strip() + ] + return text, segments finally: try: os.unlink(tmp_path) diff --git a/drb-c2-core/app/routers/upload.py b/drb-c2-core/app/routers/upload.py index 188f1e0..27bd2db 100644 --- a/drb-c2-core/app/routers/upload.py +++ b/drb-c2-core/app/routers/upload.py @@ -101,17 +101,21 @@ async def _run_intelligence_pipeline( from app.internal import transcription, intelligence, incident_correlator, alerter transcript: Optional[str] = None + segments: list[dict] = [] # Step 1: Transcription if gcs_uri: - transcript = await transcription.transcribe_call(call_id, gcs_uri, talkgroup_name) + transcript, segments = await transcription.transcribe_call(call_id, gcs_uri, talkgroup_name) # Step 2: Intelligence extraction tags: list[str] = [] incident_type: Optional[str] = None location: Optional[str] = None if transcript: - tags, incident_type, location = await intelligence.extract_tags(call_id, transcript, talkgroup_name) + tags, incident_type, location = await intelligence.extract_tags( + call_id, transcript, talkgroup_name, + talkgroup_id=talkgroup_id, system_id=system_id, segments=segments, + ) # Step 3: Incident correlation if incident_type: