Update intelligence

This commit is contained in:
Logan
2026-04-12 23:33:44 -04:00
parent 757bfe82e0
commit 7b6fd640d9
8 changed files with 456 additions and 141 deletions
+111 -77
View File
@@ -1,106 +1,140 @@
"""
Rules-based intelligence extraction from call transcripts.
Gemini-powered intelligence extraction from call transcripts.
Scans a transcript for known incident keywords, categorises the call, and
extracts rough location hints (street/intersection mentions).
Sends the transcript to Gemini Flash with a tight JSON schema prompt.
Returns structured data: incident type, tags, location, vehicles, units, severity.
No external ML dependencies — fast and always available even when STT is
disabled. Designed to run as part of the post-upload background pipeline.
Falls back gracefully if Gemini is unavailable or returns malformed output.
"""
import re
import asyncio
import json
from typing import Optional
from app.internal.logger import logger
from app.internal import firestore as fstore
_PROMPT_TEMPLATE = """You are analyzing a P25 public safety radio transcript. Extract structured information and respond ONLY with a single valid JSON object — no markdown, no explanation.
# ---------------------------------------------------------------------------
# Keyword taxonomy
# ---------------------------------------------------------------------------
Schema:
{{
"incident_type": one of "fire" | "ems" | "police" | "accident" | "other" | "unknown",
"tags": [list of specific descriptive tags, max 6, e.g. "two-car mva", "property-damage-only", "working fire", "shots-fired"],
"location": "most specific location string found, or empty string",
"vehicles": [vehicle descriptions mentioned, e.g. "Hyundai Tucson", "black sedan"],
"units": [unit IDs or officer numbers mentioned, e.g. "Unit 511", "Car 4"],
"severity": one of "minor" | "moderate" | "major" | "unknown"
}}
# Keyword taxonomy for the rules-based categoriser.
# Maps incident category -> lowercase trigger phrases matched via substring
# search against the lowercased transcript.
INCIDENT_KEYWORDS: dict[str, list[str]] = {
    # Fire service traffic
    "fire": [
        "fire", "smoke", "flames", "burning", "structure fire", "brush fire",
        "wildfire", "arson", "working fire", "fully involved",
    ],
    # Medical / EMS traffic
    "ems": [
        "cardiac", "unconscious", "breathing", "overdose", "trauma",
        "injury", "ambulance", "ems", "medic", "chest pain", "stroke",
        "unresponsive", "fall", "laceration",
    ],
    # Law-enforcement traffic
    "police": [
        "pursuit", "chase", "shots fired", "weapon", "suspect", "robbery",
        "assault", "burglary", "stolen", "fleeing", "armed", "shooting",
        "stabbing", "domestic",
    ],
    # Motor-vehicle incidents
    "accident": [
        "accident", "collision", "crash", "mvr", "vehicle", "rollover",
        "hit and run", "ped", "pedestrian", "pi", "property damage",
    ],
}
Rules:
- location: prefer intersections > addresses > mile markers > route+town > route alone > town alone. Empty string if none.
- tags: be specific and lowercase, hyphenated. Do not repeat incident_type as a tag.
- units: only identifiers explicitly mentioned, not inferred.
- Do not invent details not present in the transcript.
# Street suffix patterns for location extraction
# Matches an optional house number, then one or more capitalised words, then a
# common street-type suffix (Street/St/Avenue/... through Route/Rt), e.g.
# "123 Main Street" or "Oak Ave". IGNORECASE means the [A-Z] classes also
# accept lowercase transcripts (STT output is often uncased).
_STREET_RE = re.compile(
r'\b(?:\d+\s+)?[A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*'
r'\s+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Drive|Dr|Road|Rd|Lane|Ln'
r'|Court|Ct|Place|Pl|Way|Circle|Cir|Highway|Hwy|Route|Rt)\b',
re.IGNORECASE,
)
Transcript:
{transcript}"""
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def extract_tags(
    call_id: str,
    transcript: str,
) -> tuple[list[str], Optional[str], Optional[str]]:
    """
    Extract incident tags, type, and location from a transcript via Gemini.

    The blocking Gemini calls run in worker threads via asyncio.to_thread so
    the event loop is not stalled.

    Returns:
        (tags, primary_type, location) — any element may be empty/None when
        Gemini is unavailable or found nothing.

    Side-effect: updates calls/{call_id} in Firestore with tags, location,
    vehicles, units, severity; also stores the call embedding.
    """
    # NOTE(review): this is the reconstructed post-commit implementation; the
    # diff had the old keyword-scan body interleaved with this one.
    result = await asyncio.to_thread(_sync_extract, transcript)

    # _sync_extract returns {} on any failure, so every field needs a default.
    tags: list[str] = result.get("tags") or []
    incident_type: Optional[str] = result.get("incident_type") or None
    location: Optional[str] = result.get("location") or None
    vehicles: list[str] = result.get("vehicles") or []
    units: list[str] = result.get("units") or []
    severity: str = result.get("severity") or "unknown"

    # Normalise the catch-all categories to "no type".
    if incident_type in ("unknown", "other", ""):
        incident_type = None

    # Store embedding alongside structured data
    embedding = await asyncio.to_thread(_sync_embed, _embed_text(transcript, incident_type))

    updates: dict = {
        "tags": tags,
        "severity": severity,
    }
    # Only write optional fields that actually have content.
    if location:
        updates["location"] = location
    if vehicles:
        updates["vehicles"] = vehicles
    if units:
        updates["units"] = units
    if embedding:
        updates["embedding"] = embedding

    # Best-effort persistence: a Firestore failure should not break the
    # post-upload pipeline, so log and continue.
    try:
        await fstore.doc_set("calls", call_id, updates)
    except Exception as e:
        logger.warning(f"Could not save intelligence for call {call_id}: {e}")

    logger.info(
        f"Intelligence: call {call_id} → type={incident_type}, "
        f"tags={tags}, location={location!r}, severity={severity}"
    )
    return tags, incident_type, location
def extract_location_hint(transcript: str) -> Optional[str]:
    """Scan *transcript* for a street-level location mention.

    Returns the first match of the street/suffix pattern, or None when the
    transcript contains no recognisable street reference.
    """
    found = _STREET_RE.search(transcript)
    if found is None:
        return None
    return found.group(0)
def _sync_extract(transcript: str) -> dict:
    """Call Gemini Flash and parse the JSON response.

    Blocking — intended to be run via asyncio.to_thread.

    Returns:
        A dict matching the prompt schema, or {} when the API key is missing,
        the request fails, or the response is not a JSON object.
    """
    # Imported lazily so the module loads even when these deps are absent.
    from app.config import settings
    import google.generativeai as genai

    if not settings.gemini_api_key:
        logger.warning("GEMINI_API_KEY not set — intelligence extraction disabled.")
        return {}

    genai.configure(api_key=settings.gemini_api_key)
    model = genai.GenerativeModel(
        "gemini-1.5-flash",
        # Ask for JSON directly instead of stripping markdown fences.
        generation_config={"response_mime_type": "application/json"},
    )
    try:
        response = model.generate_content(_PROMPT_TEMPLATE.format(transcript=transcript))
        parsed = json.loads(response.text)
    except json.JSONDecodeError as e:
        logger.warning(f"Gemini returned non-JSON: {e}")
        return {}
    except Exception as e:
        logger.warning(f"Gemini extraction failed: {e}")
        return {}
    # Fix: valid JSON can still be a list or scalar; callers immediately call
    # .get(...) on the result, so anything non-dict must degrade to {}.
    if not isinstance(parsed, dict):
        logger.warning("Gemini returned JSON that is not an object; ignoring.")
        return {}
    return parsed
def _sync_embed(text: str) -> Optional[list[float]]:
    """Generate a text-embedding-004 vector for semantic similarity.

    Blocking helper; returns the embedding vector, or None when the API key
    is unset or the embedding request fails for any reason.
    """
    # Lazy imports keep module import cheap and dependency-optional.
    from app.config import settings
    import google.generativeai as genai

    # Without a key there is nothing to do — embeddings are best-effort.
    if not settings.gemini_api_key:
        return None

    genai.configure(api_key=settings.gemini_api_key)
    try:
        resp = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type="SEMANTIC_SIMILARITY",
        )
        return resp["embedding"]
    except Exception as e:
        # Swallow deliberately: a missing embedding must not fail the pipeline.
        logger.warning(f"Embedding generation failed: {e}")
        return None
def _embed_text(transcript: str, incident_type: Optional[str]) -> str:
"""Build the text string to embed — transcript + type context."""
prefix = f"[{incident_type}] " if incident_type else ""
return f"{prefix}{transcript}"