From 32f4059ae6dc26e06b7ddd84ae106e792da4bbcf Mon Sep 17 00:00:00 2001 From: Daniil Date: Fri, 3 Apr 2026 23:47:58 +0300 Subject: [PATCH] docs: add SaluteSpeech transcription engine spec and implementation plan Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-04-03-salutespeech-transcription.md | 1213 +++++++++++++++++ ...04-03-salutespeech-transcription-design.md | 410 ++++++ 2 files changed, 1623 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-03-salutespeech-transcription.md create mode 100644 docs/superpowers/specs/2026-04-03-salutespeech-transcription-design.md diff --git a/docs/superpowers/plans/2026-04-03-salutespeech-transcription.md b/docs/superpowers/plans/2026-04-03-salutespeech-transcription.md new file mode 100644 index 0000000..5a628a9 --- /dev/null +++ b/docs/superpowers/plans/2026-04-03-salutespeech-transcription.md @@ -0,0 +1,1213 @@ +# SaluteSpeech Transcription Engine — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add SaluteSpeech (Sber) as a third transcription engine with async REST API, domain-specific models, and word-level timestamps. + +**Architecture:** Direct integration following existing engine pattern — plain functions in `transcription/service.py`, `if/elif` dispatch in Dramatiq actor, no new abstractions. SaluteSpeech uses a 4-step REST flow (auth → upload → create task → poll → download) with a thread-safe OAuth token cache. + +**Tech Stack:** Python, httpx (sync), FastAPI, Dramatiq, React/TypeScript + +**Spec:** `docs/superpowers/specs/2026-04-03-salutespeech-transcription-design.md` + +--- + +## File Map + +| File | Action | Responsibility | +|------|--------|----------------| +| `cofee_backend/.certs/russian_trusted_root_ca.pem` | Create | Russian CA certificate for TLS | +| `cofee_backend/cpv3/infrastructure/settings.py` | Modify | 3 new SaluteSpeech settings fields | +| `cofee_backend/cpv3/modules/transcription/schemas.py` | Modify | New schema types, extend engine enum + type unions | +| `cofee_backend/cpv3/modules/transcription/service.py` | Modify | ~8 new functions for SaluteSpeech flow | +| `cofee_backend/cpv3/modules/transcription/router.py` | Modify | Direct `/salute-speech/` endpoint | +| `cofee_backend/cpv3/modules/tasks/schemas.py` | Modify | Extend engine Literal | +| `cofee_backend/cpv3/modules/tasks/service.py` | Modify | ENGINE_MAP + elif dispatch branch | +| `cofee_frontend/src/features/project/TranscriptionModal/TranscriptionModal.tsx` | Modify | Engine option, split model options, engine change effect | +| `cofee_frontend/src/features/project/TranscriptionSettingsStep/TranscriptionSettingsStep.tsx` | Modify | Same as TranscriptionModal | +| `cofee_backend/tests/integration/test_salutespeech_parsing.py` | Create | Unit tests for timestamp parsing + result conversion | + +--- + +### Task 1: Bundle TLS Certificate + +**Files:** +- Create: `cofee_backend/.certs/russian_trusted_root_ca.pem` + +- [ ] **Step 1: Create `.certs` directory** + +```bash +mkdir -p cofee_backend/.certs +``` + +- [ ] **Step 2: Download the Russian root CA certificate** + +```bash +curl -k "https://gu-st.ru/content/Other/doc/russian_trusted_root_ca.cer" \ + -o cofee_backend/.certs/russian_trusted_root_ca.pem +``` + +- [ ] **Step 3: Verify the cert is valid PEM format** + +```bash +openssl x509 -in 
cofee_backend/.certs/russian_trusted_root_ca.pem -noout -subject -dates +``` + +Expected: prints subject (Russian CA) and validity dates without errors. If the file is DER format instead of PEM, convert: + +```bash +openssl x509 -inform DER -in cofee_backend/.certs/russian_trusted_root_ca.pem \ + -out cofee_backend/.certs/russian_trusted_root_ca.pem -outform PEM +``` + +- [ ] **Step 4: Add to `.gitignore` exclusion** + +The `.certs/` directory should NOT be gitignored — this is a public root CA, safe to commit. Verify it's not caught by any existing gitignore pattern: + +```bash +cd cofee_backend && git check-ignore .certs/russian_trusted_root_ca.pem +``` + +Expected: no output (not ignored). + +- [ ] **Step 5: Commit** + +```bash +git add cofee_backend/.certs/russian_trusted_root_ca.pem +git commit -m "chore(backend): bundle Russian root CA cert for SaluteSpeech TLS" +``` + +--- + +### Task 2: Add SaluteSpeech Settings + +**Files:** +- Modify: `cofee_backend/cpv3/infrastructure/settings.py:97` (after `webhook_base_url` field) + +- [ ] **Step 1: Add 3 new fields to Settings class** + +In `cofee_backend/cpv3/infrastructure/settings.py`, after the `webhook_base_url` field (line 97) and before `def get_database_url(self)` (line 99), add: + +```python + # SaluteSpeech + salute_auth_key: str = Field(default="", alias="SALUTE_AUTH_KEY") + salute_ca_cert_path: Path | None = Field( + default=None, alias="SALUTE_CA_CERT_PATH" + ) + salute_scope: str = Field( + default="SALUTE_SPEECH_PERS", alias="SALUTE_SCOPE" + ) +``` + +- [ ] **Step 2: Verify settings load without errors** + +```bash +cd cofee_backend && uv run python -c "from cpv3.infrastructure.settings import get_settings; s = get_settings(); print(f'salute_auth_key={s.salute_auth_key!r}, salute_ca_cert_path={s.salute_ca_cert_path!r}, salute_scope={s.salute_scope!r}')" +``` + +Expected: `salute_auth_key='', salute_ca_cert_path=None, salute_scope='SALUTE_SPEECH_PERS'` + +- [ ] **Step 3: Commit** + +```bash +git add cofee_backend/cpv3/infrastructure/settings.py +git commit -m "feat(backend): add SaluteSpeech settings (auth key, cert path, scope)" +``` + +--- + +### Task 3: Add SaluteSpeech Schemas + +**Files:** +- Modify: `cofee_backend/cpv3/modules/transcription/schemas.py:10` (engine enum) and after line 147 (EOF, new classes) + +- [ ] **Step 1: Extend `TranscriptionEngineEnum`** + +In `cofee_backend/cpv3/modules/transcription/schemas.py`, line 10, change: + +```python +TranscriptionEngineEnum = Literal["LOCAL_WHISPER", "GOOGLE_SPEECH_CLOUD"] +``` + +to: + +```python +TranscriptionEngineEnum = Literal["LOCAL_WHISPER", "GOOGLE_SPEECH_CLOUD", "SALUTE_SPEECH"] +``` + +- [ ] **Step 2: Add SaluteSpeech schema classes** + +After the `GoogleSpeechParams` class (line 147, end of file), add: + +```python + + +# ---------------------------------- SaluteSpeech Models ---------------------------------- + + +class SaluteSpeechWord(Schema): + word: str + start: float + end: float + + +class SaluteSpeechSegment(Schema): + text: str + start: float + end: float + words: list[SaluteSpeechWord] = [] + + +class SaluteSpeechResult(Schema): + text: str + segments: list[SaluteSpeechSegment] + language: str + + +class SaluteSpeechParams(Schema): + file_path: str + language: str | None = None + model: str = "general" +``` + +- [ ] **Step 3: Verify schemas import correctly** + +```bash +cd cofee_backend && uv run python -c "from cpv3.modules.transcription.schemas import SaluteSpeechWord, SaluteSpeechSegment, SaluteSpeechResult, SaluteSpeechParams, 
TranscriptionEngineEnum; print('OK')" +``` + +Expected: `OK` + +- [ ] **Step 4: Commit** + +```bash +git add cofee_backend/cpv3/modules/transcription/schemas.py +git commit -m "feat(backend): add SaluteSpeech schema types and extend engine enum" +``` + +--- + +### Task 4: Extend Type Unions in Service + +**Files:** +- Modify: `cofee_backend/cpv3/modules/transcription/service.py:44` and `service.py:222` (type unions) +- Modify: `cofee_backend/cpv3/modules/transcription/service.py` imports (top of file) + +- [ ] **Step 1: Add SaluteSpeech imports** + +In `cofee_backend/cpv3/modules/transcription/service.py`, in the imports from `transcription.schemas` (around line 229–243), add `SaluteSpeechSegment` to the import list: + +```python +from cpv3.modules.transcription.schemas import ( + Document, + GoogleSpeechResult, + GoogleSpeechSegment, + GoogleSpeechWord, + LineNode, + SaluteSpeechSegment, + SegmentNode, + Tag, + TimeRange, + WhisperResult, + WhisperSegment, + WhisperWord, + WordNode, + WordOptions, +) +``` + +- [ ] **Step 2: Extend `compute_segment_lines` type hint** + +At line 44, change: + +```python + def compute_segment_lines( + self, segment: WhisperSegment | GoogleSpeechSegment, max_chars_per_line: int + ) -> list[LineNode]: +``` + +to: + +```python + def compute_segment_lines( + self, + segment: WhisperSegment | GoogleSpeechSegment | SaluteSpeechSegment, + max_chars_per_line: int, + ) -> list[LineNode]: +``` + +- [ ] **Step 3: Extend `_make_document_from_segments` type hint** + +At line 222, change: + +```python +def _make_document_from_segments( + builder: DocumentBuilder, + segments: list[WhisperSegment] | list[GoogleSpeechSegment], + *, + max_line_width: int, +) -> Document: +``` + +to: + +```python +def _make_document_from_segments( + builder: DocumentBuilder, + segments: list[WhisperSegment] | list[GoogleSpeechSegment] | list[SaluteSpeechSegment], + *, + max_line_width: int, +) -> Document: +``` + +- [ ] **Step 4: Run lint to verify** + +```bash +cd cofee_backend && uv run ruff check cpv3/modules/transcription/service.py +``` + +Expected: no errors. 
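
Optionally, run a quick ad-hoc sanity check that the extended unions also work at runtime (illustrative only — it assumes Task 3's schemas are committed, relies on `compute_segment_lines` touching only the shared `text`/`start`/`end`/`words` fields, and `max_chars_per_line=42` is an arbitrary value for the check):

```python
# Ad-hoc check, run with `uv run python` from cofee_backend — not committed code.
from cpv3.modules.transcription.schemas import SaluteSpeechSegment, SaluteSpeechWord
from cpv3.modules.transcription.service import DocumentBuilder

segment = SaluteSpeechSegment(
    text="привет мир",
    start=0.48,
    end=1.2,
    words=[
        SaluteSpeechWord(word="привет", start=0.48, end=0.84),
        SaluteSpeechWord(word="мир", start=0.96, end=1.2),
    ],
)
lines = DocumentBuilder().compute_segment_lines(segment, max_chars_per_line=42)
print(len(lines))  # number of LineNode objects produced for this short segment
```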
+ +- [ ] **Step 5: Commit** + +```bash +git add cofee_backend/cpv3/modules/transcription/service.py +git commit -m "feat(backend): extend type unions to accept SaluteSpeechSegment" +``` + +--- + +### Task 5: Write Tests for SaluteSpeech Parsing + +**Files:** +- Create: `cofee_backend/tests/integration/test_salutespeech_parsing.py` + +- [ ] **Step 1: Write the test file** + +Create `cofee_backend/tests/integration/test_salutespeech_parsing.py`: + +```python +"""Tests for SaluteSpeech result parsing and document building.""" + +from cpv3.modules.transcription.service import ( + _build_document_from_salute_result, + _parse_salute_time, +) + + +class TestParseSaluteTime: + def test_simple_timestamp(self): + assert _parse_salute_time("0.480s") == 0.48 + + def test_zero(self): + assert _parse_salute_time("0.000s") == 0.0 + + def test_large_timestamp(self): + assert _parse_salute_time("123.456s") == 123.456 + + def test_integer_timestamp(self): + assert _parse_salute_time("5s") == 5.0 + + +class TestBuildDocumentFromSaluteResult: + def _make_raw_result(self): + """Minimal SaluteSpeech API response for testing.""" + return [ + { + "results": [ + { + "text": "привет мир", + "normalized_text": "Привет мир.", + "start": "0.480s", + "end": "1.200s", + "word_alignments": [ + {"word": "привет", "start": "0.480s", "end": "0.840s"}, + {"word": "мир", "start": "0.960s", "end": "1.200s"}, + ], + }, + { + "text": "это тест", + "normalized_text": "Это тест.", + "start": "1.500s", + "end": "2.100s", + "word_alignments": [ + {"word": "это", "start": "1.500s", "end": "1.700s"}, + {"word": "тест", "start": "1.800s", "end": "2.100s"}, + ], + }, + ], + "channel": 0, + } + ] + + def test_returns_document_with_segments(self): + raw = self._make_raw_result() + doc = _build_document_from_salute_result(raw, language="ru-RU") + assert len(doc.segments) == 2 + + def test_segment_text(self): + raw = self._make_raw_result() + doc = _build_document_from_salute_result(raw, language="ru-RU") + assert doc.segments[0].lines[0].text == "привет мир" + + def test_word_timestamps(self): + raw = self._make_raw_result() + doc = _build_document_from_salute_result(raw, language="ru-RU") + first_word = doc.segments[0].lines[0].words[0] + assert first_word.text == "привет" + assert first_word.time.start == 0.48 + assert first_word.time.end == 0.84 + + def test_segment_time_range(self): + raw = self._make_raw_result() + doc = _build_document_from_salute_result(raw, language="ru-RU") + assert doc.segments[0].time.start == 0.48 + assert doc.segments[0].time.end == 1.2 + + def test_empty_results(self): + raw = [{"results": [], "channel": 0}] + doc = _build_document_from_salute_result(raw, language="ru-RU") + assert len(doc.segments) == 0 + + def test_missing_word_alignments(self): + raw = [ + { + "results": [ + { + "text": "привет", + "normalized_text": "Привет.", + "start": "0.000s", + "end": "0.500s", + } + ], + "channel": 0, + } + ] + doc = _build_document_from_salute_result(raw, language="ru-RU") + assert len(doc.segments) == 1 + # No words but segment still created + assert doc.segments[0].time.start == 0.0 +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +cd cofee_backend && uv run pytest tests/integration/test_salutespeech_parsing.py -v 2>&1 | head -20 +``` + +Expected: `ImportError` — `_build_document_from_salute_result` and `_parse_salute_time` don't exist yet. 
+ +- [ ] **Step 3: Commit test file** + +```bash +git add cofee_backend/tests/integration/test_salutespeech_parsing.py +git commit -m "test(backend): add SaluteSpeech parsing and document building tests" +``` + +--- + +### Task 6: Implement SaluteSpeech Service Functions + +**Files:** +- Modify: `cofee_backend/cpv3/modules/transcription/service.py` (append after line 430) + +This is the core task — all 8 SaluteSpeech functions. + +- [ ] **Step 1: Add new imports at top of file** + +In `cofee_backend/cpv3/modules/transcription/service.py`, add these imports at the top (after the existing imports, around line 10): + +```python +import threading +import time +import uuid +from pathlib import Path + +import httpx +``` + +Note: `time` may already be imported. Check and avoid duplicates. `asyncio` is already imported. `anyio` is already imported. + +Also add to the schema imports block: + +```python +from cpv3.modules.transcription.schemas import ( + ... # existing imports + SaluteSpeechResult, + SaluteSpeechSegment, + SaluteSpeechWord, + SaluteSpeechParams, +) +``` + +- [ ] **Step 2: Add constants and token cache** + +After the existing imports (before the `DocumentBuilder` class), add: + +```python +# ---------------------------------- SaluteSpeech Constants ---------------------------------- + +SALUTE_AUTH_URL = "https://ngw.devices.sberbank.ru:9443/api/v2/oauth" +SALUTE_API_BASE = "https://smartspeech.sber.ru/rest/v1" +SALUTE_POLL_INTERVAL_SECONDS = 5.0 +SALUTE_POLL_TIMEOUT_SECONDS = 600 +SALUTE_TOKEN_REFRESH_MARGIN_SECONDS = 60 + +SALUTE_ENCODING_MAP: dict[str, str] = { + ".mp3": "MP3", + ".wav": "PCM_S16LE", + ".ogg": "opus", + ".flac": "FLAC", +} + +SALUTE_CONTENT_TYPE_MAP: dict[str, str] = { + ".mp3": "audio/mpeg", + ".wav": "audio/wav", + ".ogg": "audio/ogg", + ".flac": "audio/flac", +} + +SALUTE_LANGUAGE_MAP: dict[str, str] = { + "ru": "ru-RU", + "en": "en-US", +} + +ERROR_SALUTE_AUTH_FAILED = "Ошибка авторизации SaluteSpeech: {detail}" +ERROR_SALUTE_UPLOAD_FAILED = "Ошибка загрузки файла в SaluteSpeech: {detail}" +ERROR_SALUTE_TASK_FAILED = "Ошибка распознавания SaluteSpeech: {detail}" +ERROR_SALUTE_TIMEOUT = "Превышено время ожидания распознавания SaluteSpeech" +ERROR_SALUTE_UNSUPPORTED_FORMAT = "Неподдерживаемый формат аудио для SaluteSpeech: {ext}" + +_salute_token_lock = threading.Lock() +_salute_token: str | None = None +_salute_token_expires_at: float = 0.0 +``` + +- [ ] **Step 3: Add helper functions** + +After the end of file (after `transcribe_with_google_speech`), append all SaluteSpeech functions: + +```python +# ---------------------------------- SaluteSpeech Engine ---------------------------------- + + +def _parse_salute_time(s: str) -> float: + """Parse SaluteSpeech timestamp string '0.480s' → 0.48.""" + return float(s.rstrip("s")) + + +def _get_salute_access_token(client: httpx.Client) -> str: + """Get or refresh SaluteSpeech OAuth token. 
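    Caches the token in module-level state and refreshes it once fewer than
    SALUTE_TOKEN_REFRESH_MARGIN_SECONDS seconds remain before expiry.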
Thread-safe.""" + global _salute_token, _salute_token_expires_at + with _salute_token_lock: + if _salute_token and time.monotonic() < ( + _salute_token_expires_at - SALUTE_TOKEN_REFRESH_MARGIN_SECONDS + ): + return _salute_token + + settings = get_settings() + response = client.post( + SALUTE_AUTH_URL, + headers={ + "Authorization": f"Basic {settings.salute_auth_key}", + "RqUID": str(uuid.uuid4()), + "Content-Type": "application/x-www-form-urlencoded", + }, + content=f"scope={settings.salute_scope}", + ) + if response.status_code != 200: + raise RuntimeError( + ERROR_SALUTE_AUTH_FAILED.format(detail=response.text[:200]) + ) + data = response.json() + _salute_token = data["access_token"] + expires_in_seconds = (data["expires_at"] / 1000) - time.time() + _salute_token_expires_at = time.monotonic() + expires_in_seconds + return _salute_token + + +def _upload_salute_audio( + client: httpx.Client, token: str, audio_data: bytes, content_type: str +) -> str: + """Upload audio to SaluteSpeech, return request_file_id.""" + response = client.post( + f"{SALUTE_API_BASE}/data:upload", + headers={ + "Authorization": f"Bearer {token}", + "Content-Type": content_type, + }, + content=audio_data, + timeout=120.0, + ) + if response.status_code != 200: + raise RuntimeError( + ERROR_SALUTE_UPLOAD_FAILED.format(detail=response.text[:200]) + ) + return response.json()["result"]["request_file_id"] + + +def _create_salute_task( + client: httpx.Client, + token: str, + file_id: str, + *, + language: str, + model: str, + audio_encoding: str, + sample_rate: int, +) -> str: + """Create async recognition task, return task_id.""" + body = { + "options": { + "audio_encoding": audio_encoding, + "sample_rate": sample_rate, + "language": language, + "model": model, + "channels_count": 1, + "hypotheses_count": 1, + }, + "request_file_id": file_id, + } + response = client.post( + f"{SALUTE_API_BASE}/speech:async_recognize", + headers={ + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + }, + json=body, + ) + if response.status_code != 200: + raise RuntimeError( + ERROR_SALUTE_TASK_FAILED.format(detail=response.text[:200]) + ) + return response.json()["result"]["id"] + + +def _poll_salute_task( + client: httpx.Client, + token: str, + task_id: str, + job_uuid: uuid.UUID | None, + on_progress: ProgressCallback | None, +) -> str: + """Poll task until DONE, return response_file_id. 
Checks job cancellation each iteration.""" + from cpv3.modules.tasks.service import _raise_if_job_cancelled + + start = time.monotonic() + while True: + elapsed = time.monotonic() - start + if elapsed > SALUTE_POLL_TIMEOUT_SECONDS: + raise TimeoutError(ERROR_SALUTE_TIMEOUT) + + if job_uuid is not None: + _raise_if_job_cancelled(job_uuid) + + response = client.get( + f"{SALUTE_API_BASE}/task:get", + params={"id": task_id}, + headers={"Authorization": f"Bearer {token}"}, + ) + response.raise_for_status() + result = response.json()["result"] + status = result["status"] + + if status == "DONE": + return result["response_file_id"] + if status == "ERROR": + error_msg = result.get("error", "unknown error") + raise RuntimeError( + ERROR_SALUTE_TASK_FAILED.format(detail=error_msg) + ) + + if on_progress is not None: + pct = min(elapsed / SALUTE_POLL_TIMEOUT_SECONDS * 100, 95.0) + on_progress(pct) + + time.sleep(SALUTE_POLL_INTERVAL_SECONDS) + + +def _download_salute_result( + client: httpx.Client, token: str, response_file_id: str +) -> list[dict]: + """Download recognition result JSON.""" + response = client.get( + f"{SALUTE_API_BASE}/data:download", + params={"response_file_id": response_file_id}, + headers={"Authorization": f"Bearer {token}"}, + timeout=60.0, + ) + response.raise_for_status() + return response.json() + + +def _build_document_from_salute_result( + raw_channels: list[dict], *, language: str +) -> Document: + """Convert SaluteSpeech result JSON to Document.""" + builder = DocumentBuilder() + words_options = WordOptions() + + all_segments: list[SaluteSpeechSegment] = [] + + for channel_data in raw_channels: + for result_item in channel_data.get("results", []): + word_alignments = result_item.get("word_alignments", []) + words = [ + SaluteSpeechWord( + word=w["word"], + start=_parse_salute_time(w["start"]), + end=_parse_salute_time(w["end"]), + ) + for w in word_alignments + ] + + text = result_item.get("text", "") + seg_start = _parse_salute_time(result_item["start"]) if words else 0.0 + seg_end = _parse_salute_time(result_item["end"]) if words else 0.0 + + all_segments.append( + SaluteSpeechSegment( + text=text, + start=seg_start, + end=seg_end, + words=words, + ) + ) + + document = _make_document_from_segments( + builder, all_segments, max_line_width=words_options.max_line_width + ) + return builder.process_document(document) + + +def _salute_transcribe_sync( + *, + local_file_path: str, + language: str | None, + model: str, + sample_rate: int, + job_id: uuid.UUID | None = None, + on_progress: ProgressCallback | None = None, +) -> Document: + """Synchronous SaluteSpeech transcription (runs in Dramatiq worker thread).""" + settings = get_settings() + + ext = Path(local_file_path).suffix.lower() + audio_encoding = SALUTE_ENCODING_MAP.get(ext) + content_type = SALUTE_CONTENT_TYPE_MAP.get(ext) + if not audio_encoding or not content_type: + raise ValueError(ERROR_SALUTE_UNSUPPORTED_FORMAT.format(ext=ext)) + + salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU") + + verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True + with httpx.Client(verify=verify, timeout=30.0) as client: + token = _get_salute_access_token(client) + + with open(local_file_path, "rb") as f: + audio_data = f.read() + + file_id = _upload_salute_audio(client, token, audio_data, content_type) + task_id = _create_salute_task( + client, + token, + file_id, + language=salute_language, + model=model, + audio_encoding=audio_encoding, + sample_rate=sample_rate, + ) + 
response_file_id = _poll_salute_task( + client, token, task_id, job_id, on_progress + ) + raw_result = _download_salute_result(client, token, response_file_id) + + return _build_document_from_salute_result(raw_result, language=salute_language) + + +async def transcribe_with_salute_speech( + storage: StorageService, + *, + file_key: str, + language: str | None = None, + model: str = "general", + sample_rate: int = 16000, + job_id: uuid.UUID | None = None, + on_progress: ProgressCallback | None = None, +) -> Document: + """Async wrapper for SaluteSpeech transcription.""" + tmp = await storage.download_to_temp(file_key) + try: + return await anyio.to_thread.run_sync( + lambda: _salute_transcribe_sync( + local_file_path=tmp.path, + language=language, + model=model, + sample_rate=sample_rate, + job_id=job_id, + on_progress=on_progress, + ) + ) + finally: + tmp.cleanup() +``` + +- [ ] **Step 4: Run the parsing tests** + +```bash +cd cofee_backend && uv run pytest tests/integration/test_salutespeech_parsing.py -v +``` + +Expected: all tests pass. + +- [ ] **Step 5: Run lint** + +```bash +cd cofee_backend && uv run ruff check cpv3/modules/transcription/service.py +``` + +Expected: no errors. + +- [ ] **Step 6: Commit** + +```bash +git add cofee_backend/cpv3/modules/transcription/service.py +git commit -m "feat(backend): implement SaluteSpeech transcription engine (8 functions)" +``` + +--- + +### Task 7: Add Task Dispatch + +**Files:** +- Modify: `cofee_backend/cpv3/modules/tasks/schemas.py:86` (engine Literal) +- Modify: `cofee_backend/cpv3/modules/tasks/service.py:88-91` (ENGINE_MAP) +- Modify: `cofee_backend/cpv3/modules/tasks/service.py:613-616` (actor import) +- Modify: `cofee_backend/cpv3/modules/tasks/service.py:700` (elif branch) + +- [ ] **Step 1: Extend engine Literal in task schema** + +In `cofee_backend/cpv3/modules/tasks/schemas.py`, line 86, change: + +```python + engine: Literal["whisper", "google"] = Field( +``` + +to: + +```python + engine: Literal["whisper", "google", "salutespeech"] = Field( +``` + +- [ ] **Step 2: Add to ENGINE_MAP** + +In `cofee_backend/cpv3/modules/tasks/service.py`, lines 88-91, change: + +```python +ENGINE_MAP: dict[str, str] = { + "whisper": "LOCAL_WHISPER", + "google": "GOOGLE_SPEECH_CLOUD", +} +``` + +to: + +```python +ENGINE_MAP: dict[str, str] = { + "whisper": "LOCAL_WHISPER", + "google": "GOOGLE_SPEECH_CLOUD", + "salutespeech": "SALUTE_SPEECH", +} +``` + +- [ ] **Step 3: Add import in actor** + +In `cofee_backend/cpv3/modules/tasks/service.py`, inside `transcription_generate_actor` (lines 613-616), change: + +```python + from cpv3.modules.transcription.service import ( + transcribe_with_google_speech, + transcribe_with_whisper, + ) +``` + +to: + +```python + from cpv3.modules.transcription.service import ( + transcribe_with_google_speech, + transcribe_with_salute_speech, + transcribe_with_whisper, + ) +``` + +- [ ] **Step 4: Add elif dispatch branch** + +In `cofee_backend/cpv3/modules/tasks/service.py`, after the Google branch (after line 700, before the `else:`), add: + +```python + elif engine == "salutespeech": + # Extract sample rate from probe if available + audio_stream = next( + (s for s in probe.streams if s.codec_type == "audio"), None + ) + sr = int(audio_stream.sample_rate) if audio_stream and audio_stream.sample_rate else 16000 + document = _run_async( + transcribe_with_salute_speech( + storage, + file_key=file_key, + language=language, + model=model, + sample_rate=sr, + job_id=job_uuid, + on_progress=_on_whisper_progress, + ) + ) +``` + 
+- [ ] **Step 5: Run lint** + +```bash +cd cofee_backend && uv run ruff check cpv3/modules/tasks/service.py cpv3/modules/tasks/schemas.py +``` + +Expected: no errors. + +- [ ] **Step 6: Commit** + +```bash +git add cofee_backend/cpv3/modules/tasks/schemas.py cofee_backend/cpv3/modules/tasks/service.py +git commit -m "feat(backend): add SaluteSpeech to task dispatch (ENGINE_MAP + elif branch)" +``` + +--- + +### Task 8: Add Direct Endpoint (Optional) + +**Files:** +- Modify: `cofee_backend/cpv3/modules/transcription/router.py` (after line 145) + +- [ ] **Step 1: Add route** + +In `cofee_backend/cpv3/modules/transcription/router.py`, add the import at the top alongside existing imports: + +```python +from cpv3.modules.transcription.schemas import ( + ... # existing + SaluteSpeechParams, +) +from cpv3.modules.transcription.service import ( + ... # existing + transcribe_with_salute_speech, +) +``` + +Then append after the last endpoint (after line 145): + +```python + + +@router.post("/salute-speech/", response_model=Document) +async def salute_speech_transcribe( + body: SaluteSpeechParams, + current_user: User = Depends(get_current_user), + storage: StorageService = Depends(get_storage), +) -> Document: + _ = current_user + return await transcribe_with_salute_speech( + storage, + file_key=body.file_path, + language=body.language, + model=body.model, + ) +``` + +- [ ] **Step 2: Run lint** + +```bash +cd cofee_backend && uv run ruff check cpv3/modules/transcription/router.py +``` + +Expected: no errors. + +- [ ] **Step 3: Commit** + +```bash +git add cofee_backend/cpv3/modules/transcription/router.py +git commit -m "feat(backend): add direct /salute-speech/ transcription endpoint" +``` + +--- + +### Task 9: Frontend — TranscriptionModal + +**Files:** +- Modify: `cofee_frontend/src/features/project/TranscriptionModal/TranscriptionModal.tsx` + +- [ ] **Step 1: Extend type** + +At line 17, change: + +```typescript + engine: "whisper" | "google" +``` + +to: + +```typescript + engine: "whisper" | "google" | "salutespeech" +``` + +- [ ] **Step 2: Add engine option** + +At lines 22-25, change: + +```typescript +const ENGINE_OPTIONS = [ + { value: "whisper", label: "Whisper (локальный)" }, + { value: "google", label: "Google Speech" }, +] +``` + +to: + +```typescript +const ENGINE_OPTIONS = [ + { value: "whisper", label: "Whisper (локальный)" }, + { value: "google", label: "Google Speech" }, + { value: "salutespeech", label: "SaluteSpeech" }, +] +``` + +- [ ] **Step 3: Split model options** + +Rename the existing `MODEL_OPTIONS` (lines 33-38) and add SaluteSpeech models: + +```typescript +const WHISPER_MODEL_OPTIONS = [ + { value: "base", label: "Базовая" }, + { value: "small", label: "Малая" }, + { value: "medium", label: "Средняя" }, + { value: "large", label: "Большая" }, +] + +const SALUTE_MODEL_OPTIONS = [ + { value: "general", label: "Общая" }, + { value: "finance", label: "Финансы" }, + { value: "medicine", label: "Медицина" }, +] +``` + +- [ ] **Step 4: Update model dropdown guard** + +At line 162, change the model dropdown conditional from: + +```typescript +{engine === "whisper" && ( +``` + +to: + +```typescript +{(engine === "whisper" || engine === "salutespeech") && ( +``` + +And inside, change the options reference from `MODEL_OPTIONS` to: + +```typescript +{(engine === "whisper" ? WHISPER_MODEL_OPTIONS : SALUTE_MODEL_OPTIONS).map((opt) => ( +``` + +- [ ] **Step 5: Add model reset on engine change** + +Find the component function body (after the `useForm` call). 
Add a `useEffect` that resets the model when engine changes: + +```typescript +const engine = watch("engine") + +useEffect(() => { + if (engine === "salutespeech") { + setValue("model", "general") + } else if (engine === "whisper") { + setValue("model", "base") + } +}, [engine, setValue]) +``` + +Note: `watch` and `setValue` come from `useForm` — check that they're destructured. If `watch("engine")` is already used elsewhere, reuse that variable. + +- [ ] **Step 6: Type check** + +```bash +cd cofee_frontend && bunx tsc --noEmit 2>&1 | grep -v "app/template.tsx" | grep -v "CreateProjectModal" | head -20 +``` + +Expected: no new errors. + +- [ ] **Step 7: Commit** + +```bash +git add cofee_frontend/src/features/project/TranscriptionModal/TranscriptionModal.tsx +git commit -m "feat(frontend): add SaluteSpeech engine option to TranscriptionModal" +``` + +--- + +### Task 10: Frontend — TranscriptionSettingsStep + +**Files:** +- Modify: `cofee_frontend/src/features/project/TranscriptionSettingsStep/TranscriptionSettingsStep.tsx` + +Apply the same changes as Task 9 to this file (constants are duplicated). + +- [ ] **Step 1: Extend type** + +At line 22, change: + +```typescript + engine: "whisper" | "google" +``` + +to: + +```typescript + engine: "whisper" | "google" | "salutespeech" +``` + +- [ ] **Step 2: Add engine option** + +At lines 27-30, change: + +```typescript +const ENGINE_OPTIONS = [ + { value: "whisper", label: "Whisper (локальный)" }, + { value: "google", label: "Google Speech" }, +] +``` + +to: + +```typescript +const ENGINE_OPTIONS = [ + { value: "whisper", label: "Whisper (локальный)" }, + { value: "google", label: "Google Speech" }, + { value: "salutespeech", label: "SaluteSpeech" }, +] +``` + +- [ ] **Step 3: Split model options** + +Rename `MODEL_OPTIONS` (lines 38-43) and add SaluteSpeech models: + +```typescript +const WHISPER_MODEL_OPTIONS = [ + { value: "base", label: "Базовая" }, + { value: "small", label: "Малая" }, + { value: "medium", label: "Средняя" }, + { value: "large", label: "Большая" }, +] + +const SALUTE_MODEL_OPTIONS = [ + { value: "general", label: "Общая" }, + { value: "finance", label: "Финансы" }, + { value: "medicine", label: "Медицина" }, +] +``` + +- [ ] **Step 4: Update model dropdown guard** + +At line 263, change: + +```typescript +{engine === "whisper" && ( +``` + +to: + +```typescript +{(engine === "whisper" || engine === "salutespeech") && ( +``` + +And change the options reference from `MODEL_OPTIONS` to: + +```typescript +{(engine === "whisper" ? WHISPER_MODEL_OPTIONS : SALUTE_MODEL_OPTIONS).map((opt) => ( +``` + +- [ ] **Step 5: Add model reset on engine change** + +Same `useEffect` as Task 9: + +```typescript +const engine = watch("engine") + +useEffect(() => { + if (engine === "salutespeech") { + setValue("model", "general") + } else if (engine === "whisper") { + setValue("model", "base") + } +}, [engine, setValue]) +``` + +- [ ] **Step 6: Type check** + +```bash +cd cofee_frontend && bunx tsc --noEmit 2>&1 | grep -v "app/template.tsx" | grep -v "CreateProjectModal" | head -20 +``` + +Expected: no new errors. 
+ +- [ ] **Step 7: Commit** + +```bash +git add cofee_frontend/src/features/project/TranscriptionSettingsStep/TranscriptionSettingsStep.tsx +git commit -m "feat(frontend): add SaluteSpeech engine option to TranscriptionSettingsStep" +``` + +--- + +### Task 11: Final Verification + +**Files:** None (verification only) + +- [ ] **Step 1: Backend lint** + +```bash +cd cofee_backend && uv run ruff check cpv3/ 2>&1 | head -20 +``` + +Expected: no errors. + +- [ ] **Step 2: Backend tests** + +```bash +cd cofee_backend && uv run pytest 2>&1 | tail -30 +``` + +Expected: all tests pass (including new SaluteSpeech parsing tests). + +- [ ] **Step 3: Frontend type check** + +```bash +cd cofee_frontend && bunx tsc --noEmit 2>&1 | grep -v "app/template.tsx" | grep -v "CreateProjectModal" | head -20 +``` + +Expected: no new errors. + +- [ ] **Step 4: Write verification report** + +``` +VERIFICATION REPORT +=================== +Subproject: backend + frontend +Level: base +Type check: [PASS/FAIL] +Lint: [PASS/FAIL] +Tests: [PASS/FAIL] (X passed, Y failed) +Build: SKIPPED +E2E: SKIPPED + +Files changed: ~10 +Status: [READY/NOT READY] +``` diff --git a/docs/superpowers/specs/2026-04-03-salutespeech-transcription-design.md b/docs/superpowers/specs/2026-04-03-salutespeech-transcription-design.md new file mode 100644 index 0000000..572922b --- /dev/null +++ b/docs/superpowers/specs/2026-04-03-salutespeech-transcription-design.md @@ -0,0 +1,410 @@ +# SaluteSpeech Transcription Engine — Design Spec + +**Date:** 2026-04-03 +**Status:** Approved +**Scope:** Backend (primary), Frontend (minor) + +## Overview + +Add SaluteSpeech (Sber) as a third transcription engine alongside Local Whisper and Google Speech Cloud. SaluteSpeech provides async REST-based speech recognition with word-level timestamps, domain-specific models (general/finance/medicine), and supports Russian and English. + +## Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| API protocol | REST (not gRPC) | No gRPC deps in codebase, REST covers full async flow | +| Implementation pattern | Direct integration (Approach A) | Matches existing if/elif dispatch, no new abstractions | +| HTTP client | `httpx` (sync) | Already used in workers (`tasks/service.py:12`) | +| TLS certificates | Bundled PEM in repo, path via Settings | Self-contained, no Dockerfile changes | +| Token caching | Module-level globals + `threading.Lock` | Thread-safe for Dramatiq multi-thread workers, matches existing pattern | +| Token TTL | `time.monotonic()` + actual `expires_at` from response | Avoids clock drift vs hardcoded 30 min | +| Engine short name | `"salutespeech"` | API boundary name, maps to DB `"SALUTE_SPEECH"` | +| SaluteSpeech plan | `SALUTE_SPEECH_PERS` | Personal scope, max 5 parallel streams | +| pip package | None (raw HTTP) | `salute_speech` package is unmaintained | +| Frontend model selector | Shown for SaluteSpeech (general/finance/medicine) | Meaningful differentiator, follows Whisper conditional pattern | + +## SaluteSpeech API Flow + +``` +1. Auth: POST https://ngw.devices.sberbank.ru:9443/api/v2/oauth +2. Upload: POST https://smartspeech.sber.ru/rest/v1/data:upload +3. Task: POST https://smartspeech.sber.ru/rest/v1/speech:async_recognize +4. Poll: GET https://smartspeech.sber.ru/rest/v1/task:get?id= +5. Download: GET https://smartspeech.sber.ru/rest/v1/data:download?response_file_id= +``` + +Token TTL: 30 min (from API response `expires_at`). Refresh when < 60s remaining. +Uploaded files retained 72 hours server-side. 
+Task statuses: NEW → RUNNING → DONE | ERROR. + +## Backend — Authentication & HTTP Client + +### Token Cache + +Module-level cache with `threading.Lock` for Dramatiq thread safety: + +```python +import threading + +_salute_token_lock = threading.Lock() +_salute_token: str | None = None +_salute_token_expires_at: float = 0.0 # time.monotonic() + +def _get_salute_access_token(client: httpx.Client) -> str: + global _salute_token, _salute_token_expires_at + with _salute_token_lock: + if _salute_token and time.monotonic() < _salute_token_expires_at - SALUTE_TOKEN_REFRESH_MARGIN_SECONDS: + return _salute_token + settings = get_settings() + response = client.post( + SALUTE_AUTH_URL, + headers={ + "Authorization": f"Basic {settings.salute_auth_key}", + "RqUID": str(uuid.uuid4()), + "Content-Type": "application/x-www-form-urlencoded", + }, + content=f"scope={settings.salute_scope}", + ) + response.raise_for_status() + data = response.json() + _salute_token = data["access_token"] + # expires_at is Unix ms; convert to monotonic offset + expires_in_seconds = (data["expires_at"] / 1000) - time.time() + _salute_token_expires_at = time.monotonic() + expires_in_seconds + return _salute_token +``` + +### Settings (3 new fields in `infrastructure/settings.py`) + +```python +# SaluteSpeech +salute_auth_key: str = Field(default="", alias="SALUTE_AUTH_KEY") +salute_ca_cert_path: Path | None = Field(default=None, alias="SALUTE_CA_CERT_PATH") +salute_scope: str = Field(default="SALUTE_SPEECH_PERS", alias="SALUTE_SCOPE") +``` + +- `SALUTE_AUTH_KEY` — base64 Authorization Key from Sber Studio +- `SALUTE_CA_CERT_PATH` — path to bundled Russian CA PEM (e.g., `./.certs/russian_trusted_root_ca.pem`) +- `SALUTE_SCOPE` — OAuth scope (`SALUTE_SPEECH_PERS`) + +### Per-Job httpx Client + +Created in `_salute_transcribe_sync()`, passed to all helpers for connection reuse: + +```python +verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True +with httpx.Client(verify=verify, timeout=30.0) as client: + token = _get_salute_access_token(client) + file_id = _upload_salute_audio(client, token, audio_bytes, content_type) + task_id = _create_salute_task(client, token, file_id, language, model, encoding, sample_rate) + result_file_id = _poll_salute_task(client, token, task_id, job_uuid, on_progress) + raw_result = _download_salute_result(client, token, result_file_id) + return _build_document_from_salute_result(raw_result) +``` + +### Cert File + +Bundled at `cofee_backend/.certs/russian_trusted_root_ca.pem`. Downloaded from `https://gu-st.ru/content/Other/doc/russian_trusted_root_ca.cer`. Only the public root CA — no private keys or secrets. + +## Backend — Transcription Flow & Helpers + +### Function Structure (in `transcription/service.py`) + +``` +_get_salute_access_token(client) → str +_upload_salute_audio(client, token, data, content_type) → str (request_file_id) +_create_salute_task(client, token, file_id, lang, model, ...) → str (task_id) +_poll_salute_task(client, token, task_id, job_uuid, on_prog) → str (response_file_id) +_download_salute_result(client, token, response_file_id) → dict +_parse_salute_time(s: str) → float → "0.480s" → 0.48 +_build_document_from_salute_result(raw: dict) → Document +_salute_transcribe_sync(*, local_file_path, language, model, job_id, on_progress) → Document +async transcribe_with_salute_speech(storage, *, file_key, ...) → Document +``` + +### Upload + +Read local file as bytes, send raw binary to `/data:upload` with appropriate `Content-Type`. 
No ffmpeg conversion — SaluteSpeech natively supports MP3, WAV, OGG, FLAC. + +### Audio Encoding Detection + +```python +SALUTE_ENCODING_MAP: dict[str, str] = { + ".mp3": "MP3", + ".wav": "PCM_S16LE", + ".ogg": "opus", + ".flac": "FLAC", +} + +SALUTE_CONTENT_TYPE_MAP: dict[str, str] = { + ".mp3": "audio/mpeg", + ".wav": "audio/wav", + ".ogg": "audio/ogg", + ".flac": "audio/flac", +} +``` + +### Create Task + +JSON body with `request_file_id` + options: + +```json +{ + "options": { + "audio_encoding": "MP3", + "sample_rate": 16000, + "language": "ru-RU", + "model": "general", + "channels_count": 1, + "hypotheses_count": 1 + }, + "request_file_id": "" +} +``` + +Language mapping: `"ru"` → `"ru-RU"`, `"en"` → `"en-US"`, `None`/auto → `"ru-RU"` (default). + +`sample_rate` — extracted from probe data (the actor already runs `probe_media()` before transcription). Parse from the audio stream's `sample_rate` field, fallback to `16000`. + +### Poll Loop + +Check every 5 seconds. Three critical additions vs existing engines: + +1. **Cancellation check** — `_raise_if_job_cancelled(job_uuid)` each iteration +2. **Progress reporting** — `on_progress` callback during poll so UI shows activity +3. **Timeout** — `SALUTE_POLL_TIMEOUT_SECONDS = 600` + +```python +def _poll_salute_task(client, token, task_id, job_uuid, on_progress): + start = time.monotonic() + while True: + if time.monotonic() - start > SALUTE_POLL_TIMEOUT_SECONDS: + raise TimeoutError(ERROR_SALUTE_TIMEOUT) + _raise_if_job_cancelled(job_uuid) + + resp = client.get(f"{SALUTE_API_BASE}/task:get", params={"id": task_id}, ...) + status = resp.json()["result"]["status"] + + if status == "DONE": + return resp.json()["result"]["response_file_id"] + if status == "ERROR": + raise RuntimeError(ERROR_SALUTE_TASK_FAILED.format(detail=...)) + + # Progress: estimate based on poll iteration + if on_progress: + elapsed = time.monotonic() - start + on_progress(min(elapsed / SALUTE_POLL_TIMEOUT_SECONDS * 100, 95.0)) + + time.sleep(SALUTE_POLL_INTERVAL_SECONDS) +``` + +### Download & Parse + +Download JSON from `/data:download`. Result structure: + +```json +{ + "results": [{ + "text": "...", + "normalized_text": "...", + "start": "0.480s", + "end": "3.600s", + "word_alignments": [ + {"word": "...", "start": "0.480s", "end": "0.840s"} + ] + }] +} +``` + +Parse into `SaluteSpeechSegment`/`SaluteSpeechWord`, then `_make_document_from_segments()` → `Document`. 
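
A minimal sketch of that mapping for a single result item (illustrative; it reuses the schema classes from the Schemas section below and the timestamp format shown in the response above):

```python
def _parse_salute_time(s: str) -> float:
    return float(s.rstrip("s"))  # "0.480s" -> 0.48


item = {
    "text": "привет мир",
    "start": "0.480s",
    "end": "1.200s",
    "word_alignments": [
        {"word": "привет", "start": "0.480s", "end": "0.840s"},
        {"word": "мир", "start": "0.960s", "end": "1.200s"},
    ],
}

words = [
    SaluteSpeechWord(
        word=w["word"],
        start=_parse_salute_time(w["start"]),
        end=_parse_salute_time(w["end"]),
    )
    for w in item["word_alignments"]
]
segment = SaluteSpeechSegment(
    text=item["text"],
    start=_parse_salute_time(item["start"]),
    end=_parse_salute_time(item["end"]),
    words=words,
)
# All such segments across channels are then passed to _make_document_from_segments().
```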
+ +### Constants + +```python +SALUTE_POLL_INTERVAL_SECONDS = 5.0 +SALUTE_POLL_TIMEOUT_SECONDS = 600 +SALUTE_AUTH_URL = "https://ngw.devices.sberbank.ru:9443/api/v2/oauth" +SALUTE_API_BASE = "https://smartspeech.sber.ru/rest/v1" +SALUTE_TOKEN_REFRESH_MARGIN_SECONDS = 60 + +ERROR_SALUTE_AUTH_FAILED = "Ошибка авторизации SaluteSpeech: {detail}" +ERROR_SALUTE_UPLOAD_FAILED = "Ошибка загрузки файла в SaluteSpeech: {detail}" +ERROR_SALUTE_TASK_FAILED = "Ошибка распознавания SaluteSpeech: {detail}" +ERROR_SALUTE_TIMEOUT = "Превышено время ожидания распознавания SaluteSpeech" +``` + +## Backend — Schemas & DB Model + +### New Schemas (in `transcription/schemas.py`) + +```python +class SaluteSpeechWord(Schema): + word: str + start: float + end: float + +class SaluteSpeechSegment(Schema): + text: str + start: float + end: float + words: list[SaluteSpeechWord] = [] + +class SaluteSpeechResult(Schema): + text: str + segments: list[SaluteSpeechSegment] + language: str + +class SaluteSpeechParams(Schema): + file_path: str + language: str | None = None + model: str = "general" +``` + +### Engine Enum + +```python +# transcription/schemas.py +TranscriptionEngineEnum = Literal["LOCAL_WHISPER", "GOOGLE_SPEECH_CLOUD", "SALUTE_SPEECH"] +``` + +### Type Unions + +Extend `_make_document_from_segments()` and `DocumentBuilder.compute_segment_lines()` to accept `SaluteSpeechSegment` in their type unions. + +### DB Model + +No changes. `engine` column is `String(32)`, stores `"SALUTE_SPEECH"` as a plain string. No migration needed. + +## Backend — Task Dispatch + +### ENGINE_MAP (`tasks/service.py`) + +```python +ENGINE_MAP: dict[str, str] = { + "whisper": "LOCAL_WHISPER", + "google": "GOOGLE_SPEECH_CLOUD", + "salutespeech": "SALUTE_SPEECH", +} +``` + +### Task Schema (`tasks/schemas.py`) + +```python +engine: Literal["whisper", "google", "salutespeech"] = "whisper" +``` + +### Actor Dispatch + +New `elif` branch in `transcription_generate_actor` after the Google branch: + +```python +elif engine == "salutespeech": + document = _run_async( + transcribe_with_salute_speech( + storage, + file_key=file_key, + language=language, + model=model, + job_id=job_uuid, + on_progress=_on_whisper_progress, + ) + ) +``` + +### Direct Endpoint (optional, for testing) + +```python +# transcription/router.py +@router.post("/salute-speech/", response_model=Document) +``` + +## Frontend Changes + +### TranscriptionModal.tsx & TranscriptionSettingsStep.tsx + +Both files get identical changes (constants are duplicated in both): + +**Engine options:** +```typescript +const ENGINE_OPTIONS = [ + { value: "whisper", label: "Whisper (локальный)" }, + { value: "google", label: "Google Speech" }, + { value: "salutespeech", label: "SaluteSpeech" }, +] +``` + +**Type:** +```typescript +engine: "whisper" | "google" | "salutespeech" +``` + +**Model options — split by engine:** +```typescript +const WHISPER_MODEL_OPTIONS = [ + { value: "base", label: "Base" }, + { value: "small", label: "Small" }, + { value: "medium", label: "Medium" }, + { value: "large", label: "Large" }, +] + +const SALUTE_MODEL_OPTIONS = [ + { value: "general", label: "Общая" }, + { value: "finance", label: "Финансы" }, + { value: "medicine", label: "Медицина" }, +] +``` + +**Conditional model dropdown:** +```typescript +{(engine === "whisper" || engine === "salutespeech") && ( +