feat(backend): implement SaluteSpeech transcription engine

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 00:04:49 +03:00
parent a8881e29be
commit 2c9c11fa17
1 changed files with 311 additions and 0 deletions
@@ -1,10 +1,16 @@
 from __future__ import annotations
 import asyncio
 import logging
 import threading
 import time
 import uuid
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from typing import Callable, cast
 import anyio
 import httpx
 from cpv3.infrastructure.settings import get_settings
 from cpv3.infrastructure.storage.base import StorageService
@@ -29,6 +35,7 @@ from cpv3.modules.transcription.schemas import (
    GoogleSpeechWord,
    LineNode,
    SaluteSpeechSegment,
    SaluteSpeechWord,
    SegmentNode,
    Tag,
    TimeRange,
@@ -40,6 +47,46 @@ from cpv3.modules.transcription.schemas import (
 )
 # ---------------------------------- SaluteSpeech Constants ----------------------------------
 SALUTE_AUTH_URL = "https://ngw.devices.sberbank.ru:9443/api/v2/oauth"
 SALUTE_API_BASE = "https://smartspeech.sber.ru/rest/v1"
 SALUTE_POLL_INTERVAL_SECONDS = 5.0
 SALUTE_POLL_TIMEOUT_SECONDS = 600
 SALUTE_TOKEN_REFRESH_MARGIN_SECONDS = 60
 SALUTE_ENCODING_MAP: dict[str, str] = {
    ".mp3": "MP3",
    ".wav": "PCM_S16LE",
    ".ogg": "opus",
    ".flac": "FLAC",
 }
 SALUTE_CONTENT_TYPE_MAP: dict[str, str] = {
    ".mp3": "audio/mpeg",
    ".wav": "audio/wav",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
 }
 SALUTE_LANGUAGE_MAP: dict[str, str] = {
    "ru": "ru-RU",
    "en": "en-US",
 }
 ERROR_SALUTE_AUTH_FAILED = "Ошибка авторизации SaluteSpeech: {detail}"
 ERROR_SALUTE_UPLOAD_FAILED = "Ошибка загрузки файла в SaluteSpeech: {detail}"
 ERROR_SALUTE_TASK_FAILED = "Ошибка распознавания SaluteSpeech: {detail}"
 ERROR_SALUTE_TIMEOUT = "Превышено время ожидания распознавания SaluteSpeech"
 ERROR_SALUTE_UNSUPPORTED_FORMAT = "Неподдерживаемый формат аудио для SaluteSpeech: {ext}"
 _salute_token_lock = threading.Lock()
 _salute_token: str | None = None
 _salute_token_expires_at: float = 0.0
 logger = logging.getLogger(__name__)
 class DocumentBuilder:
    def compute_segment_lines(
        self,
@@ -430,3 +477,267 @@ async def transcribe_with_google_speech(
            ogg_cleanup()
    finally:
        input_tmp.cleanup()
 # ---------------------------------- SaluteSpeech Engine ----------------------------------
 def _parse_salute_time(s: str) -> float:
    """Parse SaluteSpeech timestamp string '0.480s' → 0.48."""
    return float(s.rstrip("s"))
 def _get_salute_access_token(client: httpx.Client) -> str:
    """Get or refresh SaluteSpeech OAuth token. Thread-safe."""
    global _salute_token, _salute_token_expires_at
    with _salute_token_lock:
        if _salute_token and time.monotonic() < (
            _salute_token_expires_at - SALUTE_TOKEN_REFRESH_MARGIN_SECONDS
        ):
            return _salute_token
        settings = get_settings()
        response = client.post(
            SALUTE_AUTH_URL,
            headers={
                "Authorization": f"Basic {settings.salute_auth_key}",
                "RqUID": str(uuid.uuid4()),
                "Content-Type": "application/x-www-form-urlencoded",
            },
            content=f"scope={settings.salute_scope}",
        )
        if response.status_code != 200:
            raise RuntimeError(
                ERROR_SALUTE_AUTH_FAILED.format(detail=response.text[:200])
            )
        data = response.json()
        _salute_token = data["access_token"]
        expires_in_seconds = (data["expires_at"] / 1000) - time.time()
        _salute_token_expires_at = time.monotonic() + expires_in_seconds
        return _salute_token
 def _upload_salute_audio(
    client: httpx.Client, token: str, audio_data: bytes, content_type: str
 ) -> str:
    """Upload audio to SaluteSpeech, return request_file_id."""
    response = client.post(
        f"{SALUTE_API_BASE}/data:upload",
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": content_type,
        },
        content=audio_data,
        timeout=120.0,
    )
    if response.status_code != 200:
        raise RuntimeError(
            ERROR_SALUTE_UPLOAD_FAILED.format(detail=response.text[:200])
        )
    return response.json()["result"]["request_file_id"]
 def _create_salute_task(
    client: httpx.Client,
    token: str,
    file_id: str,
    *,
    language: str,
    model: str,
    audio_encoding: str,
    sample_rate: int,
 ) -> str:
    """Create async recognition task, return task_id."""
    body = {
        "options": {
            "audio_encoding": audio_encoding,
            "sample_rate": sample_rate,
            "language": language,
            "model": model,
            "channels_count": 1,
            "hypotheses_count": 1,
        },
        "request_file_id": file_id,
    }
    response = client.post(
        f"{SALUTE_API_BASE}/speech:async_recognize",
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        },
        json=body,
    )
    if response.status_code != 200:
        raise RuntimeError(
            ERROR_SALUTE_TASK_FAILED.format(detail=response.text[:200])
        )
    return response.json()["result"]["id"]
 def _poll_salute_task(
    client: httpx.Client,
    token: str,
    task_id: str,
    job_uuid: uuid.UUID | None,
    on_progress: ProgressCallback | None,
 ) -> str:
    """Poll task until DONE, return response_file_id."""
    start = time.monotonic()
    while True:
        elapsed = time.monotonic() - start
        if elapsed > SALUTE_POLL_TIMEOUT_SECONDS:
            raise TimeoutError(ERROR_SALUTE_TIMEOUT)
        if job_uuid is not None:
            from cpv3.modules.tasks.service import _raise_if_job_cancelled
            _raise_if_job_cancelled(job_uuid)
        response = client.get(
            f"{SALUTE_API_BASE}/task:get",
            params={"id": task_id},
            headers={"Authorization": f"Bearer {token}"},
        )
        response.raise_for_status()
        result = response.json()["result"]
        status = result["status"]
        if status == "DONE":
            return result["response_file_id"]
        if status == "ERROR":
            error_msg = result.get("error", "unknown error")
            raise RuntimeError(
                ERROR_SALUTE_TASK_FAILED.format(detail=error_msg)
            )
        if on_progress is not None:
            pct = min(elapsed / SALUTE_POLL_TIMEOUT_SECONDS * 100, 95.0)
            on_progress(pct)
        time.sleep(SALUTE_POLL_INTERVAL_SECONDS)
 def _download_salute_result(
    client: httpx.Client, token: str, response_file_id: str
 ) -> list[dict]:
    """Download recognition result JSON."""
    response = client.get(
        f"{SALUTE_API_BASE}/data:download",
        params={"response_file_id": response_file_id},
        headers={"Authorization": f"Bearer {token}"},
        timeout=60.0,
    )
    response.raise_for_status()
    return response.json()
 def _build_document_from_salute_result(
    raw_channels: list[dict], *, language: str
 ) -> Document:
    """Convert SaluteSpeech result JSON to Document."""
    builder = DocumentBuilder()
    words_options = WordOptions()
    all_segments: list[SaluteSpeechSegment] = []
    for channel_data in raw_channels:
        for result_item in channel_data.get("results", []):
            word_alignments = result_item.get("word_alignments", [])
            words = [
                SaluteSpeechWord(
                    word=w["word"],
                    start=_parse_salute_time(w["start"]),
                    end=_parse_salute_time(w["end"]),
                )
                for w in word_alignments
            ]
            text = result_item.get("text", "")
            seg_start = _parse_salute_time(result_item["start"])
            seg_end = _parse_salute_time(result_item["end"])
            all_segments.append(
                SaluteSpeechSegment(
                    text=text,
                    start=seg_start,
                    end=seg_end,
                    words=words,
                )
            )
    document = _make_document_from_segments(
        builder, all_segments, max_line_width=words_options.max_line_width
    )
    return builder.process_document(document)
 def _salute_transcribe_sync(
    *,
    local_file_path: str,
    language: str | None,
    model: str,
    sample_rate: int,
    job_id: uuid.UUID | None = None,
    on_progress: ProgressCallback | None = None,
 ) -> Document:
    """Synchronous SaluteSpeech transcription (runs in Dramatiq worker thread)."""
    settings = get_settings()
    ext = Path(local_file_path).suffix.lower()
    audio_encoding = SALUTE_ENCODING_MAP.get(ext)
    content_type = SALUTE_CONTENT_TYPE_MAP.get(ext)
    if not audio_encoding or not content_type:
        raise ValueError(ERROR_SALUTE_UNSUPPORTED_FORMAT.format(ext=ext))
    salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU")
    verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
    with httpx.Client(verify=verify, timeout=30.0) as client:
        token = _get_salute_access_token(client)
        with open(local_file_path, "rb") as f:
            audio_data = f.read()
        file_id = _upload_salute_audio(client, token, audio_data, content_type)
        task_id = _create_salute_task(
            client,
            token,
            file_id,
            language=salute_language,
            model=model,
            audio_encoding=audio_encoding,
            sample_rate=sample_rate,
        )
        response_file_id = _poll_salute_task(
            client, token, task_id, job_id, on_progress
        )
        raw_result = _download_salute_result(client, token, response_file_id)
    return _build_document_from_salute_result(raw_result, language=salute_language)
 async def transcribe_with_salute_speech(
    storage: StorageService,
    *,
    file_key: str,
    language: str | None = None,
    model: str = "general",
    sample_rate: int = 16000,
    job_id: uuid.UUID | None = None,
    on_progress: ProgressCallback | None = None,
 ) -> Document:
    """Async wrapper for SaluteSpeech transcription."""
    tmp = await storage.download_to_temp(file_key)
    try:
        return await anyio.to_thread.run_sync(
            lambda: _salute_transcribe_sync(
                local_file_path=tmp.path,
                language=language,
                model=model,
                sample_rate=sample_rate,
                job_id=job_id,
                on_progress=on_progress,
            )
        )
    finally:
        tmp.cleanup()