Start to learn vocab from talkgroups to improve accuracy of STT
This commit is contained in:
@@ -83,6 +83,13 @@ async def patch_transcript(
|
||||
"embedding": None,
|
||||
})
|
||||
|
||||
# Learn from the correction: diff original → corrected and add new tokens to vocabulary
|
||||
system_id = call.get("system_id")
|
||||
original_text = call.get("transcript_corrected") or call.get("transcript") or ""
|
||||
if system_id and original_text:
|
||||
from app.internal.vocabulary_learner import learn_from_correction
|
||||
await learn_from_correction(system_id, original_text, body.transcript)
|
||||
|
||||
from app.routers.upload import _run_extraction_pipeline
|
||||
background_tasks.add_task(
|
||||
_run_extraction_pipeline,
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
import uuid
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from app.models import SystemCreate, SystemRecord
|
||||
from app.internal import firestore as fstore
|
||||
|
||||
router = APIRouter(prefix="/systems", tags=["systems"])
|
||||
|
||||
|
||||
class VocabularyTermBody(BaseModel):
    """Request body carrying a single vocabulary term.

    Used by the vocabulary endpoints in this router that add, remove,
    approve, or dismiss one term at a time.
    """

    # The term text exactly as submitted by the client.
    term: str
|
||||
|
||||
|
||||
@router.get("")
async def list_systems():
    """Return every document stored in the ``systems`` collection."""
    all_systems = await fstore.collection_list("systems")
    return all_systems
|
||||
@@ -42,3 +48,70 @@ async def delete_system(system_id: str):
|
||||
if not existing:
|
||||
raise HTTPException(404, f"System '{system_id}' not found.")
|
||||
await fstore.doc_delete("systems", system_id)
|
||||
|
||||
|
||||
# ── Vocabulary endpoints ───────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/{system_id}/vocabulary")
async def get_vocabulary(system_id: str):
    """Return the approved vocabulary and pending suggestions for a system.

    Responds with 404 when no ``systems`` document exists for ``system_id``;
    otherwise the result of the vocabulary learner's lookup is returned
    unchanged.
    """
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")

    # Imported lazily, after the existence check, as elsewhere in this router.
    from app.internal.vocabulary_learner import get_vocabulary as fetch_vocabulary

    vocabulary = await fetch_vocabulary(system_id)
    return vocabulary
|
||||
|
||||
|
||||
@router.post("/{system_id}/vocabulary/bootstrap", status_code=202)
async def bootstrap_vocabulary(system_id: str):
    """Run the one-shot vocabulary bootstrap for a system.

    Responds with 404 when the system does not exist. Otherwise the
    bootstrap is awaited in-request and the response reports how many terms
    it returned along with the terms themselves.
    """
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")

    from app.internal.vocabulary_learner import (
        bootstrap_system_vocabulary as run_bootstrap,
    )

    seeded_terms = await run_bootstrap(system_id)
    summary = {"added": len(seeded_terms), "terms": seeded_terms}
    return summary
|
||||
|
||||
|
||||
@router.post("/{system_id}/vocabulary/terms")
async def add_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Manually add a term to the approved vocabulary.

    The submitted term is stripped of surrounding whitespace before it is
    stored.

    Raises:
        HTTPException: 404 when the system does not exist; 422 when the
            term is empty or whitespace-only after stripping.
    """
    existing = await fstore.doc_get("systems", system_id)
    if not existing:
        raise HTTPException(404, f"System '{system_id}' not found.")

    term = body.term.strip()
    # A blank term carries no information and would otherwise be written
    # into the approved vocabulary as an empty string.
    if not term:
        raise HTTPException(422, "Vocabulary term must not be empty.")

    from app.internal.vocabulary_learner import add_term

    await add_term(system_id, term)
    return {"ok": True}
|
||||
|
||||
|
||||
@router.delete("/{system_id}/vocabulary/terms")
async def remove_vocabulary_term(system_id: str, body: VocabularyTermBody):
    """Remove a term from a system's approved vocabulary.

    Responds with 404 when the system does not exist. The term is passed to
    the vocabulary learner exactly as received (no trimming).
    """
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")

    from app.internal.vocabulary_learner import remove_term as drop_term

    target = body.term
    await drop_term(system_id, target)
    return {"ok": True}
|
||||
|
||||
|
||||
@router.post("/{system_id}/vocabulary/pending/approve")
async def approve_pending(system_id: str, body: VocabularyTermBody):
    """Promote a pending suggestion into the approved vocabulary.

    Responds with 404 when the system does not exist; otherwise delegates to
    the vocabulary learner and acknowledges with ``{"ok": True}``.
    """
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")

    from app.internal.vocabulary_learner import approve_pending_term as approve

    suggestion = body.term
    await approve(system_id, suggestion)
    return {"ok": True}
|
||||
|
||||
|
||||
@router.post("/{system_id}/vocabulary/pending/dismiss")
async def dismiss_pending(system_id: str, body: VocabularyTermBody):
    """Discard a pending suggestion without adding it to the vocabulary.

    Responds with 404 when the system does not exist; otherwise delegates to
    the vocabulary learner and acknowledges with ``{"ok": True}``.
    """
    if not await fstore.doc_get("systems", system_id):
        raise HTTPException(404, f"System '{system_id}' not found.")

    from app.internal.vocabulary_learner import dismiss_pending_term as dismiss

    suggestion = body.term
    await dismiss(system_id, suggestion)
    return {"ok": True}
|
||||
|
||||
@@ -151,7 +151,9 @@ async def _run_intelligence_pipeline(
|
||||
|
||||
# Step 1: Transcription
|
||||
if gcs_uri:
|
||||
transcript, segments = await transcription.transcribe_call(call_id, gcs_uri, talkgroup_name)
|
||||
transcript, segments = await transcription.transcribe_call(
|
||||
call_id, gcs_uri, talkgroup_name, system_id=system_id
|
||||
)
|
||||
|
||||
# Step 2: Intelligence extraction
|
||||
tags: list[str] = []
|
||||
|
||||
Reference in New Issue
Block a user