diff --git a/cpv3/modules/transcription/service.py b/cpv3/modules/transcription/service.py
index c8ad8ab..38d5073 100644
--- a/cpv3/modules/transcription/service.py
+++ b/cpv3/modules/transcription/service.py
@@ -671,6 +671,54 @@ def _build_document_from_salute_result(
     return builder.process_document(document)
 
 
+def _convert_to_wav_sync(input_path: str, sample_rate: int = 16000) -> tuple[str, Callable[[], None]]:
+    """Convert any audio/video to WAV (PCM signed 16-bit LE) using ffmpeg. Sync version.
+
+    The video stream is dropped and audio is downmixed to mono at ``sample_rate`` Hz.
+
+    Returns:
+        The path of the temporary WAV file and a callable that deletes it;
+        the caller must invoke the callable once the file is no longer needed.
+
+    Raises:
+        RuntimeError: ffmpeg exited with a non-zero status.
+
+    On any failure the temporary file is removed before the error propagates.
+    """
+    import os
+    import subprocess
+
+    # Only reserve a unique path here; ffmpeg overwrites it by name ("-y").
+    with NamedTemporaryFile(suffix=".wav", delete=False) as out:
+        out_path = out.name
+
+    def _cleanup() -> None:
+        try:
+            os.remove(out_path)
+        except FileNotFoundError:
+            pass  # already removed, nothing left to do
+
+    try:
+        result = subprocess.run(
+            [
+                "ffmpeg", "-y", "-i", input_path,
+                "-vn", "-ac", "1", "-ar", str(sample_rate),
+                "-acodec", "pcm_s16le",
+                out_path,
+            ],
+            capture_output=True,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode(errors='ignore')}")
+    except BaseException:
+        # The caller never receives the cleanup callable on failure, so the
+        # temporary file has to be deleted here or it would be leaked.
+        _cleanup()
+        raise
+
+    return out_path, _cleanup
+
+
 def _salute_transcribe_sync(
     *,
     local_file_path: str,
@@ -686,34 +734,44 @@ def _salute_transcribe_sync(
     ext = Path(local_file_path).suffix.lower()
     audio_encoding = SALUTE_ENCODING_MAP.get(ext)
     content_type = SALUTE_CONTENT_TYPE_MAP.get(ext)
+
+    # Convert unsupported formats (mp4, webm, m4a, etc.) to WAV via ffmpeg
+    cleanup_fn: Callable[[], None] | None = None
     if not audio_encoding or not content_type:
-        raise ValueError(ERROR_SALUTE_UNSUPPORTED_FORMAT.format(ext=ext))
+        wav_path, cleanup_fn = _convert_to_wav_sync(local_file_path, sample_rate)
+        local_file_path = wav_path
+        audio_encoding = "PCM_S16LE"
+        content_type = "audio/wav"
 
     salute_language = SALUTE_LANGUAGE_MAP.get(language or "", "ru-RU")
 
-    verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
-    with httpx.Client(verify=verify, timeout=30.0) as client:
-        token = _get_salute_access_token(client)
+    try:
+        verify = str(settings.salute_ca_cert_path) if settings.salute_ca_cert_path else True
+        with httpx.Client(verify=verify, timeout=30.0) as client:
+            token = _get_salute_access_token(client)
 
-        with open(local_file_path, "rb") as f:
-            audio_data = f.read()
+            with open(local_file_path, "rb") as f:
+                audio_data = f.read()
 
-        file_id = _upload_salute_audio(client, token, audio_data, content_type)
-        task_id = _create_salute_task(
-            client,
-            token,
-            file_id,
-            language=salute_language,
-            model=model,
-            audio_encoding=audio_encoding,
-            sample_rate=sample_rate,
-        )
-        response_file_id = _poll_salute_task(
-            client, token, task_id, job_id, on_progress
-        )
-        raw_result = _download_salute_result(client, token, response_file_id)
+            file_id = _upload_salute_audio(client, token, audio_data, content_type)
+            task_id = _create_salute_task(
+                client,
+                token,
+                file_id,
+                language=salute_language,
+                model=model,
+                audio_encoding=audio_encoding,
+                sample_rate=sample_rate,
+            )
+            response_file_id = _poll_salute_task(
+                client, token, task_id, job_id, on_progress
+            )
+            raw_result = _download_salute_result(client, token, response_file_id)
 
-    return _build_document_from_salute_result(raw_result, language=salute_language)
+        return _build_document_from_salute_result(raw_result, language=salute_language)
+    finally:
+        if cleanup_fn is not None:
+            cleanup_fn()
 
 
 async def transcribe_with_salute_speech(