Start to learn vocab from talkgroups to improve accuracy of STT

This commit is contained in:
Logan
2026-04-21 22:17:30 -04:00
parent 6612e4b683
commit 338b946ba3
11 changed files with 759 additions and 8 deletions
+7
View File
@@ -83,6 +83,13 @@ async def patch_transcript(
"embedding": None,
})
# Learn from the correction: diff original → corrected and add new tokens to vocabulary
system_id = call.get("system_id")
original_text = call.get("transcript_corrected") or call.get("transcript") or ""
if system_id and original_text:
from app.internal.vocabulary_learner import learn_from_correction
await learn_from_correction(system_id, original_text, body.transcript)
from app.routers.upload import _run_extraction_pipeline
background_tasks.add_task(
_run_extraction_pipeline,
+73
View File
@@ -1,11 +1,17 @@
import uuid
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import Optional
from app.models import SystemCreate, SystemRecord
from app.internal import firestore as fstore
router = APIRouter(prefix="/systems", tags=["systems"])
class VocabularyTermBody(BaseModel):
    """Request body carrying a single vocabulary term.

    Shared by the manual add/remove endpoints and by the pending
    approve/dismiss endpoints under ``/systems/{system_id}/vocabulary``.
    """

    # The term text exactly as sent by the client; no normalization here.
    term: str
@router.get("")
async def list_systems():
    """Return every document in the ``systems`` collection."""
    all_systems = await fstore.collection_list("systems")
    return all_systems
@@ -42,3 +48,70 @@ async def delete_system(system_id: str):
if not existing:
raise HTTPException(404, f"System '{system_id}' not found.")
await fstore.doc_delete("systems", system_id)
# ── Vocabulary endpoints ───────────────────────────────────────────────────────
@router.get("/{system_id}/vocabulary")
async def get_vocabulary(system_id: str):
    """Fetch the approved vocabulary and pending suggestions for a system.

    Raises:
        HTTPException: 404 when no ``systems`` document exists for
            ``system_id``.
    """
    system_doc = await fstore.doc_get("systems", system_id)
    if system_doc:
        # Imported lazily, only once the system is known to exist.
        from app.internal.vocabulary_learner import (
            get_vocabulary as load_vocabulary,
        )

        vocabulary = await load_vocabulary(system_id)
        return vocabulary

    raise HTTPException(
        status_code=404,
        detail=f"System '{system_id}' not found.",
    )
@router.post("/{system_id}/vocabulary/bootstrap", status_code=202)
async def bootstrap_vocabulary(system_id: str):
    """Run the one-shot GPT-4o vocabulary bootstrap for a system.

    Returns:
        A dict with ``added`` (the number of terms returned by the bootstrap)
        and ``terms`` (the terms themselves).

    Raises:
        HTTPException: 404 when no ``systems`` document exists for
            ``system_id``.
    """
    # NOTE(review): the route answers 202, yet the bootstrap is awaited inline
    # and its result is returned — confirm the status code is intentional.
    system_doc = await fstore.doc_get("systems", system_id)
    if system_doc:
        from app.internal.vocabulary_learner import (
            bootstrap_system_vocabulary as run_bootstrap,
        )

        seeded_terms = await run_bootstrap(system_id)
        return {"added": len(seeded_terms), "terms": seeded_terms}

    raise HTTPException(
        status_code=404,
        detail=f"System '{system_id}' not found.",
    )
@router.post("/{system_id}/vocabulary/terms")
async def add_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Manually add a term to the approved vocabulary.

    Surrounding whitespace is stripped from the term before it is stored.

    Returns:
        ``{"ok": True}`` once the term has been handed to the vocabulary
        learner.

    Raises:
        HTTPException: 404 when no ``systems`` document exists for
            ``system_id``; 422 when the term is empty or whitespace-only.
    """
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")

    term = body.term.strip()
    if not term:
        # Without this guard a blank submission would reach add_term as "".
        raise HTTPException(422, "Vocabulary term must not be empty.")

    from app.internal.vocabulary_learner import add_term

    await add_term(system_id, term)
    return {"ok": True}
@router.delete("/{system_id}/vocabulary/terms")
async def remove_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Delete a term from a system's approved vocabulary.

    The term is passed to the vocabulary learner exactly as received.

    Raises:
        HTTPException: 404 when no ``systems`` document exists for
            ``system_id``.
    """
    # NOTE(review): the term arrives in a DELETE request body; confirm that
    # every client and proxy in front of this API forwards DELETE bodies.
    system_doc = await fstore.doc_get("systems", system_id)
    if system_doc:
        from app.internal.vocabulary_learner import remove_term as drop_term

        await drop_term(system_id, body.term)
        return {"ok": True}

    raise HTTPException(
        status_code=404,
        detail=f"System '{system_id}' not found.",
    )
@router.post("/{system_id}/vocabulary/pending/approve")
async def approve_pending(system_id: str, body: VocabularyTermBody):
    """Promote a pending induction suggestion to the approved vocabulary.

    Raises:
        HTTPException: 404 when no ``systems`` document exists for
            ``system_id``.
    """
    system_doc = await fstore.doc_get("systems", system_id)
    if system_doc:
        from app.internal.vocabulary_learner import (
            approve_pending_term as approve_term,
        )

        await approve_term(system_id, body.term)
        return {"ok": True}

    raise HTTPException(
        status_code=404,
        detail=f"System '{system_id}' not found.",
    )
@router.post("/{system_id}/vocabulary/pending/dismiss")
async def dismiss_pending(system_id: str, body: VocabularyTermBody):
    """Discard a pending induction suggestion without approving it.

    Raises:
        HTTPException: 404 when no ``systems`` document exists for
            ``system_id``.
    """
    system_doc = await fstore.doc_get("systems", system_id)
    if system_doc:
        from app.internal.vocabulary_learner import (
            dismiss_pending_term as dismiss_term,
        )

        await dismiss_term(system_id, body.term)
        return {"ok": True}

    raise HTTPException(
        status_code=404,
        detail=f"System '{system_id}' not found.",
    )
+3 -1
View File
@@ -151,7 +151,9 @@ async def _run_intelligence_pipeline(
# Step 1: Transcription
if gcs_uri:
transcript, segments = await transcription.transcribe_call(call_id, gcs_uri, talkgroup_name)
transcript, segments = await transcription.transcribe_call(
call_id, gcs_uri, talkgroup_name, system_id=system_id
)
# Step 2: Intelligence extraction
tags: list[str] = []