fix(backend): convert unsupported formats to WAV before SaluteSpeech upload
MP4/WebM/M4A files are now automatically converted to WAV (PCM_S16LE) via ffmpeg before being uploaded to the SaluteSpeech API. This follows the same pattern as Google Speech's _convert_local_to_ogg.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -671,6 +671,33 @@ def _build_document_from_salute_result(
|
|||||||
return builder.process_document(document)
|
return builder.process_document(document)
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_to_wav_sync(input_path: str, sample_rate: int = 16000) -> tuple[str, Callable[[], None]]:
    """Convert any audio/video to WAV (PCM signed 16-bit LE) using ffmpeg. Sync version.

    Args:
        input_path: Path of the media file handed to ffmpeg as its input.
        sample_rate: Value passed to ffmpeg's ``-ar`` option for the output.

    Returns:
        A ``(wav_path, cleanup)`` pair. ``wav_path`` is a temporary ``.wav``
        file that outlives this call; invoking ``cleanup()`` removes it and
        is safe to call more than once.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status; the message
            carries ffmpeg's decoded stderr.
        OSError: the ffmpeg executable could not be started (propagated
            unchanged from ``subprocess.run``).

    On any failure the temporary output file is removed before the exception
    propagates, because the caller never receives the cleanup callback then.
    """
    import os
    import subprocess

    # Only the unique name is needed: the handle is closed right away and
    # ffmpeg ("-y") overwrites the empty placeholder at that path.
    with NamedTemporaryFile(suffix=".wav", delete=False) as out:
        out_path = out.name

    def _cleanup() -> None:
        # EAFP: tolerate the file already being gone (repeat calls, races).
        try:
            os.remove(out_path)
        except FileNotFoundError:
            pass

    try:
        result = subprocess.run(
            [
                "ffmpeg", "-y", "-i", input_path,
                "-vn", "-ac", "1", "-ar", str(sample_rate),
                "-acodec", "pcm_s16le",
                out_path,
            ],
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode(errors='ignore')}")
    except BaseException:
        # Do not leak the temp file when conversion did not succeed.
        _cleanup()
        raise

    return out_path, _cleanup
|
||||||
|
|
||||||
|
|
||||||
def _salute_transcribe_sync(
|
def _salute_transcribe_sync(
|
||||||
*,
|
*,
|
||||||
local_file_path: str,
|
local_file_path: str,
|
||||||
@@ -686,34 +713,44 @@ def _salute_transcribe_sync(
|
|||||||
ext = Path(local_file_path).suffix.lower()
|
ext = Path(local_file_path).suffix.lower()
|
||||||
audio_encoding = SALUTE_ENCODING_MAP.get(ext)
|
audio_encoding = SALUTE_ENCODING_MAP.get(ext)
|
||||||
content_type = SALUTE_CONTENT_TYPE_MAP.get(ext)
|
content_type = SALUTE_CONTENT_TYPE_MAP.get(ext)
|
||||||
|
|
||||||
|
# Convert unsupported formats (mp4, webm, m4a, etc.) to WAV via ffmpeg
|
||||||
|
cleanup_fn: Callable[[], None] | None = None
|
||||||
if not audio_encoding or not content_type:
|
if not audio_encoding or not content_type:
|
||||||
raise ValueError(ERROR_SALUTE_UNSUPPORTED_FORMAT.format(ext=ext))
|
wav_path, cleanup_fn = _convert_to_wav_sync(local_file_path, sample_rate)
|
||||||
|
local_file_path = wav_path
|
||||||
|
audio_encoding = "PCM_S16LE"
|
||||||
|
content_type = "audio/wav"
|
||||||
|
|
||||||
salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU")
|
salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU")
|
||||||
|
|
||||||
verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
|
try:
|
||||||
with httpx.Client(verify=verify, timeout=30.0) as client:
|
verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
|
||||||
token = _get_salute_access_token(client)
|
with httpx.Client(verify=verify, timeout=30.0) as client:
|
||||||
|
token = _get_salute_access_token(client)
|
||||||
|
|
||||||
with open(local_file_path, "rb") as f:
|
with open(local_file_path, "rb") as f:
|
||||||
audio_data = f.read()
|
audio_data = f.read()
|
||||||
|
|
||||||
file_id = _upload_salute_audio(client, token, audio_data, content_type)
|
file_id = _upload_salute_audio(client, token, audio_data, content_type)
|
||||||
task_id = _create_salute_task(
|
task_id = _create_salute_task(
|
||||||
client,
|
client,
|
||||||
token,
|
token,
|
||||||
file_id,
|
file_id,
|
||||||
language=salute_language,
|
language=salute_language,
|
||||||
model=model,
|
model=model,
|
||||||
audio_encoding=audio_encoding,
|
audio_encoding=audio_encoding,
|
||||||
sample_rate=sample_rate,
|
sample_rate=sample_rate,
|
||||||
)
|
)
|
||||||
response_file_id = _poll_salute_task(
|
response_file_id = _poll_salute_task(
|
||||||
client, token, task_id, job_id, on_progress
|
client, token, task_id, job_id, on_progress
|
||||||
)
|
)
|
||||||
raw_result = _download_salute_result(client, token, response_file_id)
|
raw_result = _download_salute_result(client, token, response_file_id)
|
||||||
|
|
||||||
return _build_document_from_salute_result(raw_result, language=salute_language)
|
return _build_document_from_salute_result(raw_result, language=salute_language)
|
||||||
|
finally:
|
||||||
|
if cleanup_fn is not None:
|
||||||
|
cleanup_fn()
|
||||||
|
|
||||||
|
|
||||||
async def transcribe_with_salute_speech(
|
async def transcribe_with_salute_speech(
|
||||||
|
|||||||
Reference in New Issue
Block a user