Update intelligence

This commit is contained in:
Logan
2026-04-12 23:33:44 -04:00
parent 757bfe82e0
commit 7b6fd640d9
8 changed files with 456 additions and 141 deletions
+111 -77
View File
@@ -1,106 +1,140 @@
"""
Rules-based intelligence extraction from call transcripts.
Gemini-powered intelligence extraction from call transcripts.
Scans a transcript for known incident keywords, categorises the call, and
extracts rough location hints (street/intersection mentions).
Sends the transcript to Gemini Flash with a tight JSON schema prompt.
Returns structured data: incident type, tags, location, vehicles, units, severity.
No external ML dependencies — fast and always available even when STT is
disabled. Designed to run as part of the post-upload background pipeline.
Falls back gracefully if Gemini is unavailable or returns malformed output.
"""
import re
import asyncio
import json
from typing import Optional
from app.internal.logger import logger
from app.internal import firestore as fstore
_PROMPT_TEMPLATE = """You are analyzing a P25 public safety radio transcript. Extract structured information and respond ONLY with a single valid JSON object — no markdown, no explanation.
# ---------------------------------------------------------------------------
# Keyword taxonomy
# ---------------------------------------------------------------------------
Schema:
{{
"incident_type": one of "fire" | "ems" | "police" | "accident" | "other" | "unknown",
"tags": [list of specific descriptive tags, max 6, e.g. "two-car mva", "property-damage-only", "working fire", "shots-fired"],
"location": "most specific location string found, or empty string",
"vehicles": [vehicle descriptions mentioned, e.g. "Hyundai Tucson", "black sedan"],
"units": [unit IDs or officer numbers mentioned, e.g. "Unit 511", "Car 4"],
"severity": one of "minor" | "moderate" | "major" | "unknown"
}}
# Keyword taxonomy for the rules-based categoriser.
# Maps incident category -> lowercase trigger phrases matched via substring
# search against the lowercased transcript.
INCIDENT_KEYWORDS: dict[str, list[str]] = {
    # Fire service traffic
    "fire": [
        "fire", "smoke", "flames", "burning", "structure fire", "brush fire",
        "wildfire", "arson", "working fire", "fully involved",
    ],
    # Medical / EMS traffic
    "ems": [
        "cardiac", "unconscious", "breathing", "overdose", "trauma",
        "injury", "ambulance", "ems", "medic", "chest pain", "stroke",
        "unresponsive", "fall", "laceration",
    ],
    # Law-enforcement traffic
    "police": [
        "pursuit", "chase", "shots fired", "weapon", "suspect", "robbery",
        "assault", "burglary", "stolen", "fleeing", "armed", "shooting",
        "stabbing", "domestic",
    ],
    # Motor-vehicle incidents
    "accident": [
        "accident", "collision", "crash", "mvr", "vehicle", "rollover",
        "hit and run", "ped", "pedestrian", "pi", "property damage",
    ],
}
Rules:
- location: prefer intersections > addresses > mile markers > route+town > route alone > town alone. Empty string if none.
- tags: be specific and lowercase, hyphenated. Do not repeat incident_type as a tag.
- units: only identifiers explicitly mentioned, not inferred.
- Do not invent details not present in the transcript.
# Street suffix patterns for location extraction
# Matches an optional house number, then one or more capitalised words, then a
# common street-type suffix (Street/St/Avenue/... through Route/Rt), e.g.
# "123 Main Street" or "Oak Ave". IGNORECASE means the [A-Z] classes also
# accept lowercase transcripts (STT output is often uncased).
_STREET_RE = re.compile(
r'\b(?:\d+\s+)?[A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*'
r'\s+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln'
r'|Court|Ct|Place|Pl|Way|Circle|Cir|Highway|Hwy|Route|Rt)\b',
re.IGNORECASE,
)
Transcript:
{transcript}"""
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def extract_tags(
    call_id: str,
    transcript: str,
) -> tuple[list[str], Optional[str], Optional[str]]:
    """
    Extract incident tags, type, and location from a transcript via Gemini.

    The blocking Gemini calls run in worker threads via asyncio.to_thread so
    the event loop is not stalled.

    Returns:
        (tags, primary_type, location) — any element may be empty/None when
        Gemini is unavailable or found nothing.

    Side-effect: updates calls/{call_id} in Firestore with tags, location,
    vehicles, units, severity; also stores the call embedding.
    """
    # NOTE(review): this is the reconstructed post-commit implementation; the
    # diff had the old keyword-scan body interleaved with this one.
    result = await asyncio.to_thread(_sync_extract, transcript)

    # _sync_extract returns {} on any failure, so every field needs a default.
    tags: list[str] = result.get("tags") or []
    incident_type: Optional[str] = result.get("incident_type") or None
    location: Optional[str] = result.get("location") or None
    vehicles: list[str] = result.get("vehicles") or []
    units: list[str] = result.get("units") or []
    severity: str = result.get("severity") or "unknown"

    # Normalise the catch-all categories to "no type".
    if incident_type in ("unknown", "other", ""):
        incident_type = None

    # Store embedding alongside structured data
    embedding = await asyncio.to_thread(_sync_embed, _embed_text(transcript, incident_type))

    updates: dict = {
        "tags": tags,
        "severity": severity,
    }
    # Only write optional fields that actually have content.
    if location:
        updates["location"] = location
    if vehicles:
        updates["vehicles"] = vehicles
    if units:
        updates["units"] = units
    if embedding:
        updates["embedding"] = embedding

    # Best-effort persistence: a Firestore failure should not break the
    # post-upload pipeline, so log and continue.
    try:
        await fstore.doc_set("calls", call_id, updates)
    except Exception as e:
        logger.warning(f"Could not save intelligence for call {call_id}: {e}")

    logger.info(
        f"Intelligence: call {call_id} → type={incident_type}, "
        f"tags={tags}, location={location!r}, severity={severity}"
    )
    return tags, incident_type, location
def extract_location_hint(transcript: str) -> Optional[str]:
    """Scan *transcript* for a street-level location mention.

    Returns the first match of the street/suffix pattern, or None when the
    transcript contains no recognisable street reference.
    """
    found = _STREET_RE.search(transcript)
    if found is None:
        return None
    return found.group(0)
def _sync_extract(transcript: str) -> dict:
    """Call Gemini Flash and parse the JSON response.

    Blocking — intended to be run via asyncio.to_thread.

    Returns:
        A dict matching the prompt schema, or {} when the API key is missing,
        the request fails, or the response is not a JSON object.
    """
    # Imported lazily so the module loads even when these deps are absent.
    from app.config import settings
    import google.generativeai as genai

    if not settings.gemini_api_key:
        logger.warning("GEMINI_API_KEY not set — intelligence extraction disabled.")
        return {}

    genai.configure(api_key=settings.gemini_api_key)
    model = genai.GenerativeModel(
        "gemini-1.5-flash",
        # Ask for JSON directly instead of stripping markdown fences.
        generation_config={"response_mime_type": "application/json"},
    )
    try:
        response = model.generate_content(_PROMPT_TEMPLATE.format(transcript=transcript))
        parsed = json.loads(response.text)
    except json.JSONDecodeError as e:
        logger.warning(f"Gemini returned non-JSON: {e}")
        return {}
    except Exception as e:
        logger.warning(f"Gemini extraction failed: {e}")
        return {}
    # Fix: valid JSON can still be a list or scalar; callers immediately call
    # .get(...) on the result, so anything non-dict must degrade to {}.
    if not isinstance(parsed, dict):
        logger.warning("Gemini returned JSON that is not an object; ignoring.")
        return {}
    return parsed
def _sync_embed(text: str) -> Optional[list[float]]:
    """Generate a text-embedding-004 vector for semantic similarity.

    Blocking helper; returns the embedding vector, or None when the API key
    is unset or the embedding request fails for any reason.
    """
    # Lazy imports keep module import cheap and dependency-optional.
    from app.config import settings
    import google.generativeai as genai

    # Without a key there is nothing to do — embeddings are best-effort.
    if not settings.gemini_api_key:
        return None

    genai.configure(api_key=settings.gemini_api_key)
    try:
        resp = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type="SEMANTIC_SIMILARITY",
        )
        return resp["embedding"]
    except Exception as e:
        # Swallow deliberately: a missing embedding must not fail the pipeline.
        logger.warning(f"Embedding generation failed: {e}")
        return None
def _embed_text(transcript: str, incident_type: Optional[str]) -> str:
"""Build the text string to embed — transcript + type context."""
prefix = f"[{incident_type}] " if incident_type else ""
return f"{prefix}{transcript}"