fix(backend): convert unsupported formats to WAV before SaluteSpeech upload

MP4/webm/m4a files are now auto-converted to WAV (PCM_S16LE) via ffmpeg before uploading to the SaluteSpeech API. Follows the same pattern as Google Speech's _convert_local_to_ogg.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 00:29:22 +03:00
parent 7f7db41bc3
commit 269724d553
1 changed files with 58 additions and 21 deletions
@@ -671,6 +671,33 @@ def _build_document_from_salute_result(
    return builder.process_document(document)


+def _convert_to_wav_sync(input_path: str, sample_rate: int = 16000) -> tuple[str, Callable[[], None]]:
+    """Convert audio/video to mono 16-bit PCM WAV via ffmpeg; return (wav_path, cleanup).
+
+    The caller must invoke ``cleanup`` to delete the temp file. Raises RuntimeError on a
+    non-zero ffmpeg exit; the temp file is removed before any error propagates.
+    """
+    import os
+    import subprocess
+    with NamedTemporaryFile(suffix=".wav", delete=False) as out:
+        out_path = out.name  # only reserves a unique path; ffmpeg overwrites it (-y)
+
+    def _cleanup() -> None:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+    cmd = ["ffmpeg", "-y", "-i", input_path, "-vn", "-ac", "1",
+           "-ar", str(sample_rate), "-acodec", "pcm_s16le", out_path]
+    try:
+        result = subprocess.run(cmd, capture_output=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode(errors='ignore')}")
+    except BaseException:  # ffmpeg missing, failed, or interrupted: don't leak the temp file
+        _cleanup()
+        raise
+    return out_path, _cleanup
+
+
 def _salute_transcribe_sync(
    *,
    local_file_path: str,
@@ -686,34 +713,44 @@ def _salute_transcribe_sync(
    ext = Path(local_file_path).suffix.lower()
    audio_encoding = SALUTE_ENCODING_MAP.get(ext)
    content_type = SALUTE_CONTENT_TYPE_MAP.get(ext)
+
+    # Convert unsupported formats (mp4, webm, m4a, etc.) to WAV via ffmpeg
+    cleanup_fn: Callable[[], None] | None = None
    if not audio_encoding or not content_type:
-        raise ValueError(ERROR_SALUTE_UNSUPPORTED_FORMAT.format(ext=ext))
+        wav_path, cleanup_fn = _convert_to_wav_sync(local_file_path, sample_rate)
+        local_file_path = wav_path
+        audio_encoding = "PCM_S16LE"
+        content_type = "audio/wav"

    salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU")

-    verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
-    with httpx.Client(verify=verify, timeout=30.0) as client:
-        token = _get_salute_access_token(client)
+    try:
+        verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
+        with httpx.Client(verify=verify, timeout=30.0) as client:
+            token = _get_salute_access_token(client)

-        with open(local_file_path, "rb") as f:
-            audio_data = f.read()
+            with open(local_file_path, "rb") as f:
+                audio_data = f.read()

-        file_id = _upload_salute_audio(client, token, audio_data, content_type)
-        task_id = _create_salute_task(
-            client,
-            token,
-            file_id,
-            language=salute_language,
-            model=model,
-            audio_encoding=audio_encoding,
-            sample_rate=sample_rate,
-        )
-        response_file_id = _poll_salute_task(
-            client, token, task_id, job_id, on_progress
-        )
-        raw_result = _download_salute_result(client, token, response_file_id)
+            file_id = _upload_salute_audio(client, token, audio_data, content_type)
+            task_id = _create_salute_task(
+                client,
+                token,
+                file_id,
+                language=salute_language,
+                model=model,
+                audio_encoding=audio_encoding,
+                sample_rate=sample_rate,
+            )
+            response_file_id = _poll_salute_task(
+                client, token, task_id, job_id, on_progress
+            )
+            raw_result = _download_salute_result(client, token, response_file_id)

-    return _build_document_from_salute_result(raw_result, language=salute_language)
+        return _build_document_from_salute_result(raw_result, language=salute_language)
+    finally:
+        if cleanup_fn is not None:
+            cleanup_fn()


 async def transcribe_with_salute_speech(