server-26/drb-c2-core/app/internal/vocabulary_learner.py
Logan 7e1b01a275 Updates to reduce firestore calls to try and stay in free tier
### Firestore read reductions

**1. `doc_get_cached()` in `firestore.py` — new 5-min TTL cache**
One place, benefits everything. System and node config documents almost never change during a monitoring session.

**2. System doc: 4 reads → 1 per call**
| Before | After |
|---|---|
| `upload.py` — `doc_get("systems")` for ai_flags | `doc_get_cached` |
| `transcription.py` — `get_vocabulary()` → `doc_get("systems")` | cache hit |
| `intelligence.py` — `get_vocabulary()` → `doc_get("systems")` | cache hit |
| `intelligence.py` — `doc_get("systems")` again for ten_codes | eliminated (reads same cached doc) |

**3. Node doc: cached in `_on_call_start` and `intelligence.py`**
The node is read every call event to get `assigned_system_id` and lat/lon for geocoding. Both now use the cache — node assignments and positions essentially never change at runtime.

**4. Node sweeper: 30s → 90s interval**
The sweeper was doing a full node collection scan 3× more often than necessary — the offline threshold is already 90s. Cuts sweeper reads by 66%.

**5. Vocabulary induction: scans all-time calls → last 7 days**
Previously fetched every ended call for a system (could be thousands). Now scoped to the last 7 days.

> **Note:** The vocabulary induction query `(system_id == X, ended_at >= cutoff)` needs a Firestore
> composite index on `(system_id ASC, ended_at ASC)`. When the induction loop first fires it will log
> an error with a Firebase Console link to create it in one click.
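For those who prefer to pre-create the index rather than click through the console link, the equivalent entry in a `firestore.indexes.json` (deployed with `firebase deploy --only firestore:indexes`) would look roughly like this — a sketch, assuming the collection is named `calls` as in the code below:

```json
{
  "indexes": [
    {
      "collectionGroup": "calls",
      "queryScope": "COLLECTION",
      "fields": [
        { "fieldPath": "system_id", "order": "ASCENDING" },
        { "fieldPath": "ended_at", "order": "ASCENDING" }
      ]
    }
  ]
}
```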
2026-05-04 02:05:00 -04:00


"""
Per-system vocabulary learning for STT accuracy improvement.
Three mechanisms:
1. Bootstrap — one-shot GPT-4o call generates local knowledge at system setup:
agencies + abbreviations, unit naming, streets, acronyms.
2. Correction — diffs admin transcript edits, extracts corrected tokens → vocabulary.
3. Induction — background loop samples N tokens of transcripts per system,
asks GPT-4o-mini to propose new terms → queued as pending for review.
Firestore schema additions on system documents:
vocabulary: list[str] — approved terms; injected into Whisper + GPT prompts
vocabulary_pending: list[dict] — induction suggestions awaiting admin review
each: {term, source, added_at}
vocabulary_bootstrapped: bool — bootstrap has been run at least once
"""
import asyncio
import difflib
import json
import random
from datetime import datetime, timedelta, timezone
from typing import Optional
from app.internal.logger import logger
from app.internal import firestore as fstore
from app.config import settings
# ─────────────────────────────────────────────────────────────────────────────
# Prompt templates
# ─────────────────────────────────────────────────────────────────────────────
_BOOTSTRAP_PROMPT = """\
You are building a radio vocabulary dictionary to improve speech-to-text accuracy for a P25 \
public-safety radio monitoring system in a specific area. The STT model has no local knowledge, \
so common terms like "YVAC" get transcribed as "why vac", "5-baker" as "5 acre", etc.
System name: {system_name}
System type: {system_type}
Area context: {area_hint}
Return ONLY a JSON object:
{{"vocabulary": [list of strings]}}
Include terms you are confident about for this area:
- Agency names and their radio abbreviations (e.g. "YVAC" = Yorktown Volunteer Ambulance Corps)
- Unit ID examples using the local naming convention (e.g. "5-baker", "5-charlie", "1-david";
many departments use APCO phonetics: adam, baker, charles, david, edward, frank, george,
henry, ida, john, king, lincoln, mary, nora, ocean, paul, queen, robert, sam, tom, union,
victor, william, x-ray, young, zebra)
- Major routes, roads, and key intersections
- Local landmarks and geographic references dispatchers use
- Agency-specific codes that differ from standard APCO
Return a flat list of strings — abbreviations, proper names, unit IDs, street names.
Do NOT include common English words. Max 80 terms. Only include what you are confident is \
accurate for this specific area; return fewer terms rather than guessing."""
_INDUCTION_PROMPT = """\
You are analyzing P25 emergency radio transcripts to find vocabulary terms that should be \
added to improve future speech-to-text accuracy for this system.
System: {system_name}
Existing approved vocabulary (do not re-propose these): {existing_vocab}
Sampled transcripts:
{transcript_block}
Find terms that are LIKELY STT errors or local terms missing from the vocabulary:
- Unit IDs that appear garbled (e.g. "5 acre" → "5-baker")
- Agency acronyms spelled out phonetically (e.g. "why vac" → "YVAC")
- Street names or locations that look misspelled or oddly transcribed
- Callsigns or local codes not yet in the vocabulary
Return ONLY a JSON object:
{{"new_terms": ["term1", "term2", ...]}}
Only include high-confidence additions not already in existing vocabulary.
Return {{"new_terms": []}} if nothing new is found."""
# ─────────────────────────────────────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────────────────────────────────────
async def bootstrap_system_vocabulary(system_id: str) -> list[str]:
    """
    One-shot GPT-4o bootstrap: generate local-knowledge vocabulary for a system.

    Merges generated terms into system.vocabulary and sets vocabulary_bootstrapped=True.
    Returns the list of newly generated terms.
    """
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        logger.warning(f"Vocabulary bootstrap: system {system_id} not found")
        return []

    system_name = system_doc.get("name", "Unknown")
    system_type = system_doc.get("type", "P25")

    # Build area hint from configured talkgroup names
    talkgroups = system_doc.get("config", {}).get("talkgroups", [])
    tg_names = [tg.get("name", "") for tg in talkgroups if tg.get("name")][:8]
    area_hint = f"Talkgroups include: {', '.join(tg_names)}" if tg_names else "Unknown area"

    terms = await asyncio.to_thread(_sync_bootstrap, system_name, system_type, area_hint)
    if not terms:
        return []

    existing = system_doc.get("vocabulary") or []
    existing_lower = {t.lower() for t in existing}
    to_add = [t for t in terms if t.lower() not in existing_lower]
    merged = list(dict.fromkeys(existing + to_add))
    await fstore.doc_set("systems", system_id, {
        "vocabulary": merged,
        "vocabulary_bootstrapped": True,
    })
    logger.info(
        f"Vocabulary bootstrap: {len(to_add)} term(s) generated for system {system_id} "
        f"({system_name})"
    )
    return to_add

async def learn_from_correction(system_id: str, original: str, corrected: str) -> None:
    """
    Diff original and corrected transcripts; append new tokens to the approved vocabulary.

    Called automatically when an admin saves a transcript correction.
    """
    if not system_id or not original or not corrected:
        return
    new_terms = _diff_new_terms(original, corrected)
    if not new_terms:
        return

    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    existing = system_doc.get("vocabulary") or []
    existing_lower = {t.lower() for t in existing}
    to_add = [t for t in new_terms if t.lower() not in existing_lower]
    if not to_add:
        return

    merged = list(dict.fromkeys(existing + to_add))
    await fstore.doc_set("systems", system_id, {"vocabulary": merged})
    logger.info(
        f"Vocabulary: learned {len(to_add)} term(s) from correction on system {system_id}: "
        f"{to_add}"
    )

async def approve_pending_term(system_id: str, term: str) -> None:
    """Move a pending term into the approved vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    pending = [p for p in (system_doc.get("vocabulary_pending") or []) if p["term"] != term]
    vocab = system_doc.get("vocabulary") or []
    if term.lower() not in {t.lower() for t in vocab}:
        vocab = list(dict.fromkeys(vocab + [term]))
    await fstore.doc_set("systems", system_id, {
        "vocabulary": vocab,
        "vocabulary_pending": pending,
    })

async def dismiss_pending_term(system_id: str, term: str) -> None:
    """Remove a pending term without adding it to vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    pending = [p for p in (system_doc.get("vocabulary_pending") or []) if p["term"] != term]
    await fstore.doc_set("systems", system_id, {"vocabulary_pending": pending})

async def add_term(system_id: str, term: str) -> None:
    """Manually add a term to the approved vocabulary."""
    term = term.strip()  # strip once up front so the dedup check and stored value agree
    if not term:
        return
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    vocab = system_doc.get("vocabulary") or []
    if term.lower() not in {t.lower() for t in vocab}:
        vocab = list(dict.fromkeys(vocab + [term]))
        await fstore.doc_set("systems", system_id, {"vocabulary": vocab})

async def remove_term(system_id: str, term: str) -> None:
    """Remove a term from the approved vocabulary."""
    system_doc = await fstore.doc_get("systems", system_id)
    if not system_doc:
        return
    vocab = [t for t in (system_doc.get("vocabulary") or []) if t.lower() != term.lower()]
    await fstore.doc_set("systems", system_id, {"vocabulary": vocab})

async def get_vocabulary(system_id: str) -> dict:
    """Return vocabulary and pending terms for a system (TTL-cached, 5 min)."""
    doc = await fstore.doc_get_cached("systems", system_id)
    if not doc:
        return {"vocabulary": [], "vocabulary_pending": [], "vocabulary_bootstrapped": False}
    return {
        "vocabulary": doc.get("vocabulary") or [],
        "vocabulary_pending": doc.get("vocabulary_pending") or [],
        "vocabulary_bootstrapped": doc.get("vocabulary_bootstrapped", False),
    }
# ─────────────────────────────────────────────────────────────────────────────
# Prompt-injection helpers (called by transcription.py and intelligence.py)
# ─────────────────────────────────────────────────────────────────────────────
def build_whisper_vocab_prompt(vocabulary: list[str]) -> str:
    """
    Format vocabulary for Whisper prompt injection.

    Whisper's prompt field acts as a context prior with a ~224-token limit.
    The base _WHISPER_PROMPT uses ~70 tokens; we budget ~150 tokens (≈550 chars) here.
    """
    if not vocabulary:
        return ""
    char_budget = 550
    terms: list[str] = []
    used = 0
    for term in vocabulary:
        cost = len(term) + 2  # ", "
        if used + cost > char_budget:
            break
        terms.append(term)
        used += cost
    return ", ".join(terms) + ". " if terms else ""


def build_gpt_vocab_block(vocabulary: list[str]) -> str:
    """Format vocabulary for injection into GPT extraction prompts."""
    if not vocabulary:
        return ""
    return f"Known local terms: {', '.join(vocabulary)}\n"
# ─────────────────────────────────────────────────────────────────────────────
# Background induction loop
# ─────────────────────────────────────────────────────────────────────────────
async def vocabulary_induction_loop() -> None:
    """Background loop: periodically run an induction pass across all systems."""
    from app.internal.feature_flags import get_flags

    interval = settings.vocabulary_induction_interval_hours * 3600
    logger.info(
        f"Vocabulary induction loop started — "
        f"interval: {settings.vocabulary_induction_interval_hours}h, "
        f"sample budget: {settings.vocabulary_induction_sample_tokens} tokens"
    )
    while True:
        await asyncio.sleep(interval)
        try:
            flags = await get_flags()
            if flags["vocabulary_learning_enabled"]:
                await _run_induction_pass()
            else:
                logger.info("Vocabulary learning disabled — skipping induction pass")
        except Exception as e:
            logger.error(f"Vocabulary induction pass failed: {e}")

async def _run_induction_pass() -> None:
    systems = await fstore.collection_list("systems")
    if not systems:
        return
    logger.info(f"Vocabulary induction: processing {len(systems)} system(s)")
    for system in systems:
        system_id = system.get("system_id")
        if system_id:
            try:
                await _induct_system(system_id, system)
            except Exception as e:
                logger.warning(f"Induction failed for system {system_id}: {e}")

async def _induct_system(system_id: str, system_doc: dict) -> None:
    """Sample random transcripts for a system and propose new vocabulary."""
    system_name = system_doc.get("name", "Unknown")
    existing_vocab: list[str] = system_doc.get("vocabulary") or []

    # Fetch calls from the last 7 days only — avoids scanning the entire history.
    # Active calls have ended_at=None and are excluded by the range filter automatically.
    # Needs a composite index on (system_id ASC, ended_at ASC).
    cutoff = datetime.now(timezone.utc) - timedelta(days=7)
    all_calls = await fstore.collection_where("calls", [
        ("system_id", "==", system_id),
        ("ended_at", ">=", cutoff),
    ])
    if not all_calls:
        return

    # Random sample up to the token budget (4 chars ≈ 1 token)
    random.shuffle(all_calls)
    char_budget = settings.vocabulary_induction_sample_tokens * 4
    transcript_block = ""
    sampled = 0
    for call in all_calls:
        text = call.get("transcript_corrected") or call.get("transcript") or ""
        if not text:
            continue
        if len(transcript_block) + len(text) > char_budget:
            break
        tg = call.get("talkgroup_name") or f"TGID {call.get('talkgroup_id', '?')}"
        transcript_block += f"[{tg}] {text}\n"
        sampled += 1
    if sampled < 3:
        return  # not enough data to learn from yet

    new_terms = await asyncio.to_thread(
        _sync_induct, system_name, existing_vocab, transcript_block
    )
    if not new_terms:
        return

    now = datetime.now(timezone.utc).isoformat()
    existing_pending: list[dict] = system_doc.get("vocabulary_pending") or []
    pending_lower = {p["term"].lower() for p in existing_pending}
    vocab_lower = {t.lower() for t in existing_vocab}
    to_queue = [
        {"term": t, "source": "induction", "added_at": now}
        for t in new_terms
        if t.lower() not in vocab_lower and t.lower() not in pending_lower
    ]
    if not to_queue:
        return

    await fstore.doc_set("systems", system_id, {
        "vocabulary_pending": existing_pending + to_queue,
    })
    logger.info(
        f"Vocabulary induction: {len(to_queue)} new term(s) proposed for "
        f"system {system_id} ({system_name}): {[p['term'] for p in to_queue]}"
    )
# ─────────────────────────────────────────────────────────────────────────────
# Internal sync helpers
# ─────────────────────────────────────────────────────────────────────────────
_STOP_WORDS = {
    "the", "and", "for", "are", "was", "were", "this", "that", "with",
    "have", "has", "had", "but", "not", "from", "they", "will", "what",
    "can", "all", "been", "one", "two", "three", "four", "five", "six",
    "you", "out", "who", "get", "her", "him", "his", "its", "our", "my",
    "via", "per", "any", "now", "got", "she", "let", "did", "may", "yes",
    "sir", "say", "see", "too", "off", "how", "put", "set", "try", "back",
    "just", "like", "into", "than", "them", "then", "some", "also", "onto",
    "went", "over", "copy", "okay", "unit", "post", "road", "lane", "going",
    "being", "doing", "there", "their", "about", "would", "could", "should",
    "route", "north", "south", "east", "west", "avenue", "street", "drive",
}

def _diff_new_terms(original: str, corrected: str) -> list[str]:
    """
    Token-level diff: find tokens in `corrected` that replaced or were inserted
    relative to `original`. These are the admin's intended spellings — good
    candidates for vocabulary.
    """
    orig_tokens = original.split()
    corr_tokens = corrected.split()
    matcher = difflib.SequenceMatcher(
        None,
        [t.lower() for t in orig_tokens],
        [t.lower() for t in corr_tokens],
    )
    new_terms: list[str] = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        if tag in ("insert", "replace"):
            for tok in corr_tokens[j1:j2]:
                clean = tok.strip(".,!?;:()'\"").strip("-")
                if len(clean) >= 3 and clean.lower() not in _STOP_WORDS:
                    new_terms.append(clean)
    return list(dict.fromkeys(new_terms))

def _sync_bootstrap(system_name: str, system_type: str, area_hint: str) -> list[str]:
    from app.config import settings as cfg
    from openai import OpenAI

    if not cfg.openai_api_key:
        return []
    prompt = _BOOTSTRAP_PROMPT.format(
        system_name=system_name,
        system_type=system_type,
        area_hint=area_hint,
    )
    try:
        client = OpenAI(api_key=cfg.openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        data = json.loads(response.choices[0].message.content)
        terms = data.get("vocabulary") or []
        return [str(t).strip() for t in terms if str(t).strip()]
    except Exception as e:
        logger.warning(f"Vocabulary bootstrap GPT call failed: {e}")
        return []

def _sync_induct(
    system_name: str, existing_vocab: list[str], transcript_block: str
) -> list[str]:
    from app.config import settings as cfg
    from openai import OpenAI

    if not cfg.openai_api_key:
        return []
    vocab_str = ", ".join(existing_vocab[:80]) if existing_vocab else "(none yet)"
    prompt = _INDUCTION_PROMPT.format(
        system_name=system_name,
        existing_vocab=vocab_str,
        transcript_block=transcript_block[:8000],
    )
    try:
        client = OpenAI(api_key=cfg.openai_api_key)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        data = json.loads(response.choices[0].message.content)
        terms = data.get("new_terms") or []
        return [str(t).strip() for t in terms if str(t).strip()]
    except Exception as e:
        logger.warning(f"Vocabulary induction GPT call failed: {e}")
        return []