Update intelligence
This commit is contained in:
@@ -1,106 +1,140 @@
|
||||
"""
|
||||
Rules-based intelligence extraction from call transcripts.
|
||||
Gemini-powered intelligence extraction from call transcripts.
|
||||
|
||||
Scans a transcript for known incident keywords, categorises the call, and
|
||||
extracts rough location hints (street/intersection mentions).
|
||||
Sends the transcript to Gemini Flash with a tight JSON schema prompt.
|
||||
Returns structured data: incident type, tags, location, vehicles, units, severity.
|
||||
|
||||
No external ML dependencies — fast and always available even when STT is
|
||||
disabled. Designed to run as part of the post-upload background pipeline.
|
||||
Falls back gracefully if Gemini is unavailable or returns malformed output.
|
||||
"""
|
||||
import re
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional
|
||||
from app.internal.logger import logger
|
||||
from app.internal import firestore as fstore
|
||||
|
||||
_PROMPT_TEMPLATE = """You are analyzing a P25 public safety radio transcript. Extract structured information and respond ONLY with a single valid JSON object — no markdown, no explanation.
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Keyword taxonomy
|
||||
# ---------------------------------------------------------------------------
|
||||
Schema:
|
||||
{{
|
||||
"incident_type": one of "fire" | "ems" | "police" | "accident" | "other" | "unknown",
|
||||
"tags": [list of specific descriptive tags, max 6, e.g. "two-car mva", "property-damage-only", "working fire", "shots-fired"],
|
||||
"location": "most specific location string found, or empty string",
|
||||
"vehicles": [vehicle descriptions mentioned, e.g. "Hyundai Tucson", "black sedan"],
|
||||
"units": [unit IDs or officer numbers mentioned, e.g. "Unit 511", "Car 4"],
|
||||
"severity": one of "minor" | "moderate" | "major" | "unknown"
|
||||
}}
|
||||
|
||||
INCIDENT_KEYWORDS: dict[str, list[str]] = {
|
||||
"fire": [
|
||||
"fire", "smoke", "flames", "burning", "structure fire", "brush fire",
|
||||
"wildfire", "arson", "working fire", "fully involved",
|
||||
],
|
||||
"ems": [
|
||||
"cardiac", "unconscious", "breathing", "overdose", "trauma",
|
||||
"injury", "ambulance", "ems", "medic", "chest pain", "stroke",
|
||||
"unresponsive", "fall", "laceration",
|
||||
],
|
||||
"police": [
|
||||
"pursuit", "chase", "shots fired", "weapon", "suspect", "robbery",
|
||||
"assault", "burglary", "stolen", "fleeing", "armed", "shooting",
|
||||
"stabbing", "domestic",
|
||||
],
|
||||
"accident": [
|
||||
"accident", "collision", "crash", "mvr", "vehicle", "rollover",
|
||||
"hit and run", "ped", "pedestrian", "pi", "property damage",
|
||||
],
|
||||
}
|
||||
Rules:
|
||||
- location: prefer intersections > addresses > mile markers > route+town > route alone > town alone. Empty string if none.
|
||||
- tags: be specific and lowercase, hyphenated. Do not repeat incident_type as a tag.
|
||||
- units: only identifiers explicitly mentioned, not inferred.
|
||||
- Do not invent details not present in the transcript.
|
||||
|
||||
# Street suffix patterns for location extraction
|
||||
_STREET_RE = re.compile(
|
||||
r'\b(?:\d+\s+)?[A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*'
|
||||
r'\s+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln'
|
||||
r'|Court|Ct|Place|Pl|Way|Circle|Cir|Highway|Hwy|Route|Rt)\b',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
Transcript:
|
||||
{transcript}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _as_str_list(value: object) -> list[str]:
    """Coerce a model-supplied value into an ordered, de-duplicated list of non-empty strings."""
    if isinstance(value, str):
        # The model occasionally returns a bare string where a list was requested.
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    stripped = (item.strip() for item in value if isinstance(item, str))
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(item for item in stripped if item))


def _as_str(value: object) -> str:
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


async def extract_tags(
    call_id: str,
    transcript: str,
) -> tuple[list[str], Optional[str], Optional[str]]:
    """
    Extract incident tags, type, and location from a transcript via Gemini.

    The blocking Gemini calls (_sync_extract, _sync_embed) run in worker
    threads via asyncio.to_thread. Model output is treated as untrusted: a
    non-object payload is handled like an empty result, list fields are
    reduced to unique non-empty strings, and scalar fields must be strings.

    Args:
        call_id: Document id under the "calls" collection.
        transcript: Transcript text to analyse.

    Returns:
        (tags, primary_type, location)
        primary_type is None when the model reports "unknown"/"other" or
        nothing usable; location is None when no location was found.

    Side-effect: writes tags and severity to calls/{call_id} in Firestore,
    plus location, vehicles, units and the call embedding when present.
    A failed write is logged and does not raise.
    """
    result = await asyncio.to_thread(_sync_extract, transcript)
    if not isinstance(result, dict):
        # Valid JSON is not necessarily an object; avoid AttributeError on .get().
        result = {}

    tags = _as_str_list(result.get("tags"))
    vehicles = _as_str_list(result.get("vehicles"))
    units = _as_str_list(result.get("units"))
    location: Optional[str] = _as_str(result.get("location")) or None
    severity: str = _as_str(result.get("severity")).lower() or "unknown"

    incident_type: Optional[str] = _as_str(result.get("incident_type")).lower() or None
    if incident_type in ("unknown", "other"):
        incident_type = None

    # Store embedding alongside structured data
    embedding = await asyncio.to_thread(
        _sync_embed, _embed_text(transcript, incident_type)
    )

    updates: dict = {
        "tags": tags,
        "severity": severity,
    }
    # Optional fields are only written when the model actually produced them.
    if location:
        updates["location"] = location
    if vehicles:
        updates["vehicles"] = vehicles
    if units:
        updates["units"] = units
    if embedding:
        updates["embedding"] = embedding

    try:
        # NOTE(review): the previous implementation used fstore.doc_update here;
        # confirm fstore.doc_set merges rather than replacing the whole document.
        await fstore.doc_set("calls", call_id, updates)
    except Exception as e:
        # Best-effort persistence: the extracted values are still returned.
        logger.warning(f"Could not save intelligence for call {call_id}: {e}")

    logger.info(
        f"Intelligence: call {call_id} → type={incident_type}, "
        f"tags={tags}, location={location!r}, severity={severity}"
    )
    return tags, incident_type, location
|
||||
|
||||
|
||||
def extract_location_hint(transcript: str) -> Optional[str]:
    """Return the first span of *transcript* matched by ``_STREET_RE``, or None if there is no match."""
    found = _STREET_RE.search(transcript)
    if found is None:
        return None
    return found.group(0)
|
||||
def _sync_extract(transcript: str) -> dict:
    """Call Gemini Flash and parse the JSON response.

    Synchronous; extract_tags runs it through asyncio.to_thread.

    Args:
        transcript: Transcript text substituted into ``_PROMPT_TEMPLATE``.

    Returns:
        The parsed JSON object. An empty dict is returned instead when the
        API key is not configured, the transcript is blank, the SDK cannot be
        imported, the request fails, or the response is not a JSON object.
    """
    from app.config import settings

    if not settings.gemini_api_key:
        logger.warning("GEMINI_API_KEY not set — intelligence extraction disabled.")
        return {}

    if not transcript or not transcript.strip():
        # Nothing to analyse; skip the model call entirely.
        return {}

    try:
        import google.generativeai as genai
    except ImportError as e:
        # Treat a missing SDK like any other "Gemini unavailable" condition.
        logger.warning(f"Gemini extraction failed: {e}")
        return {}

    try:
        genai.configure(api_key=settings.gemini_api_key)
        model = genai.GenerativeModel(
            "gemini-1.5-flash",
            generation_config={"response_mime_type": "application/json"},
        )
        response = model.generate_content(_PROMPT_TEMPLATE.format(transcript=transcript))
        parsed = json.loads(response.text)
    except json.JSONDecodeError as e:
        logger.warning(f"Gemini returned non-JSON: {e}")
        return {}
    except Exception as e:
        logger.warning(f"Gemini extraction failed: {e}")
        return {}

    if not isinstance(parsed, dict):
        # json.loads also accepts arrays, strings, numbers and null; callers
        # rely on dict.get, so anything else counts as malformed output.
        logger.warning(
            f"Gemini returned JSON of type {type(parsed).__name__}, expected an object"
        )
        return {}
    return parsed
|
||||
|
||||
|
||||
def _sync_embed(text: str) -> Optional[list[float]]:
    """Request a text-embedding-004 vector for *text* (task type SEMANTIC_SIMILARITY).

    Returns the "embedding" entry of the API result, or None when no Gemini
    API key is configured or the request raises (the failure is logged).
    """
    from app.config import settings
    import google.generativeai as genai

    if not settings.gemini_api_key:
        return None

    genai.configure(api_key=settings.gemini_api_key)
    try:
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type="SEMANTIC_SIMILARITY",
        )
        vector = response["embedding"]
    except Exception as exc:
        logger.warning(f"Embedding generation failed: {exc}")
        return None
    return vector
|
||||
|
||||
|
||||
def _embed_text(transcript: str, incident_type: Optional[str]) -> str:
    """Return the string to embed: the transcript, prefixed with "[incident_type] " when a type is given."""
    if not incident_type:
        return f"{transcript}"
    return f"[{incident_type}] {transcript}"
|
||||
|
||||
Reference in New Issue
Block a user