fix(backend): convert unsupported formats to WAV before SaluteSpeech upload

Files in formats SaluteSpeech does not accept (MP4, WebM, M4A, etc.) are now
auto-converted to WAV (PCM_S16LE) via ffmpeg before being uploaded to the
SaluteSpeech API. This follows the same pattern as Google Speech's
_convert_local_to_ogg.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Daniil
2026-04-04 00:29:22 +03:00
parent 7f7db41bc3
commit 269724d553
+38 -1
View File
@@ -671,6 +671,33 @@ def _build_document_from_salute_result(
return builder.process_document(document) return builder.process_document(document)
def _convert_to_wav_sync(input_path: str, sample_rate: int = 16000) -> tuple[str, Callable[[], None]]:
    """Convert any audio/video file to mono WAV (PCM signed 16-bit LE) using ffmpeg.

    Synchronous: blocks until the ffmpeg process exits.

    Args:
        input_path: Path of the media file to convert.
        sample_rate: Output sample rate in Hz, passed to ffmpeg's ``-ar``.

    Returns:
        A ``(wav_path, cleanup)`` tuple. ``wav_path`` is a temporary ``.wav``
        file that is NOT deleted automatically; the caller must invoke
        ``cleanup()`` once it is done with the file. ``cleanup`` is safe to
        call more than once.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The message
            carries ffmpeg's stderr output.
        OSError: If the ffmpeg executable cannot be launched (for example
            ``FileNotFoundError`` when it is not on ``PATH``).

    On any failure the temporary output file is removed before the exception
    propagates, so a failed conversion leaves nothing behind.
    """
    import contextlib
    import os
    import subprocess

    # Only the reserved name is needed: the handle is closed right away so
    # that ffmpeg can write to the path itself.
    with NamedTemporaryFile(suffix=".wav", delete=False) as out:
        out_path = out.name

    def _cleanup() -> None:
        # EAFP: the file may already be gone (repeated cleanup, external removal).
        with contextlib.suppress(FileNotFoundError):
            os.remove(out_path)

    try:
        result = subprocess.run(
            [
                "ffmpeg", "-y", "-i", input_path,
                # -vn drops any video stream, -ac 1 downmixes to mono.
                "-vn", "-ac", "1", "-ar", str(sample_rate),
                "-acodec", "pcm_s16le",
                out_path,
            ],
            capture_output=True,
            # Keep ffmpeg from reading the parent's stdin.
            stdin=subprocess.DEVNULL,
        )
        if result.returncode != 0:
            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode(errors='ignore')}")
    except BaseException:
        # The caller never receives the cleanup callback on this path, so the
        # temp file has to be removed here or it would leak.
        _cleanup()
        raise

    return out_path, _cleanup
def _salute_transcribe_sync( def _salute_transcribe_sync(
*, *,
local_file_path: str, local_file_path: str,
@@ -686,11 +713,18 @@ def _salute_transcribe_sync(
ext = Path(local_file_path).suffix.lower() ext = Path(local_file_path).suffix.lower()
audio_encoding = SALUTE_ENCODING_MAP.get(ext) audio_encoding = SALUTE_ENCODING_MAP.get(ext)
content_type = SALUTE_CONTENT_TYPE_MAP.get(ext) content_type = SALUTE_CONTENT_TYPE_MAP.get(ext)
# Convert unsupported formats (mp4, webm, m4a, etc.) to WAV via ffmpeg
cleanup_fn: Callable[[], None] | None = None
if not audio_encoding or not content_type: if not audio_encoding or not content_type:
raise ValueError(ERROR_SALUTE_UNSUPPORTED_FORMAT.format(ext=ext)) wav_path, cleanup_fn = _convert_to_wav_sync(local_file_path, sample_rate)
local_file_path = wav_path
audio_encoding = "PCM_S16LE"
content_type = "audio/wav"
salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU") salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU")
try:
verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
with httpx.Client(verify=verify, timeout=30.0) as client: with httpx.Client(verify=verify, timeout=30.0) as client:
token = _get_salute_access_token(client) token = _get_salute_access_token(client)
@@ -714,6 +748,9 @@ def _salute_transcribe_sync(
raw_result = _download_salute_result(client, token, response_file_id) raw_result = _download_salute_result(client, token, response_file_id)
return _build_document_from_salute_result(raw_result, language=salute_language) return _build_document_from_salute_result(raw_result, language=salute_language)
finally:
if cleanup_fn is not None:
cleanup_fn()
async def transcribe_with_salute_speech( async def transcribe_with_salute_speech(