"""Tests for SaluteSpeech result parsing and document building."""

import math

from cpv3.modules.transcription.service import (
    _build_document_from_salute_result,
    _parse_salute_time,
)

# Timestamps are floats parsed from strings such as "0.480s". Comparing them
# with a tiny absolute tolerance keeps the tests independent of how the parser
# arrives at the value (exact ``==`` only holds for bit-identical doubles).
_TIME_TOLERANCE = 1e-9


def _same_time(actual, expected):
    """Return True when two second-valued timestamps match within tolerance."""
    return math.isclose(
        actual, expected, rel_tol=0.0, abs_tol=_TIME_TOLERANCE
    )


class TestParseSaluteTime:
    """Checks for ``_parse_salute_time`` on "<seconds>s" strings."""

    def test_simple_timestamp(self):
        """A fractional value with a trailing zero is parsed as seconds."""
        assert _same_time(_parse_salute_time("0.480s"), 0.48)

    def test_zero(self):
        """The zero timestamp is parsed as 0.0."""
        assert _same_time(_parse_salute_time("0.000s"), 0.0)

    def test_large_timestamp(self):
        """A value above one hundred seconds keeps its fractional part."""
        assert _same_time(_parse_salute_time("123.456s"), 123.456)

    def test_integer_timestamp(self):
        """A value without a decimal point is still parsed."""
        assert _same_time(_parse_salute_time("5s"), 5.0)


class TestBuildDocumentFromSaluteResult:
    """Checks for ``_build_document_from_salute_result``."""

    def _make_raw_result(self):
        """Minimal SaluteSpeech API response for testing.

        One channel with two results; each result carries two aligned words.
        """
        return [
            {
                "results": [
                    {
                        "text": "привет мир",
                        "normalized_text": "Привет мир.",
                        "start": "0.480s",
                        "end": "1.200s",
                        "word_alignments": [
                            {
                                "word": "привет",
                                "start": "0.480s",
                                "end": "0.840s",
                            },
                            {
                                "word": "мир",
                                "start": "0.960s",
                                "end": "1.200s",
                            },
                        ],
                    },
                    {
                        "text": "это тест",
                        "normalized_text": "Это тест.",
                        "start": "1.500s",
                        "end": "2.100s",
                        "word_alignments": [
                            {
                                "word": "это",
                                "start": "1.500s",
                                "end": "1.700s",
                            },
                            {
                                "word": "тест",
                                "start": "1.800s",
                                "end": "2.100s",
                            },
                        ],
                    },
                ],
                "channel": 0,
            }
        ]

    def test_returns_document_with_segments(self):
        """Each entry of ``results`` becomes one segment."""
        raw = self._make_raw_result()
        doc = _build_document_from_salute_result(raw, language="ru-RU")
        assert len(doc.segments) == 2

    def test_segment_text(self):
        """The first line of a segment carries the result's raw ``text``."""
        raw = self._make_raw_result()
        doc = _build_document_from_salute_result(raw, language="ru-RU")
        assert doc.segments[0].lines[0].text == "привет мир"

    def test_word_timestamps(self):
        """Word alignments are exposed as words with parsed start/end times."""
        raw = self._make_raw_result()
        doc = _build_document_from_salute_result(raw, language="ru-RU")
        first_word = doc.segments[0].lines[0].words[0]
        assert first_word.text == "привет"
        assert _same_time(first_word.time.start, 0.48)
        assert _same_time(first_word.time.end, 0.84)

    def test_segment_time_range(self):
        """A segment's time range matches the result's start and end."""
        raw = self._make_raw_result()
        doc = _build_document_from_salute_result(raw, language="ru-RU")
        assert _same_time(doc.segments[0].time.start, 0.48)
        assert _same_time(doc.segments[0].time.end, 1.2)

    def test_empty_results(self):
        """A channel with no results yields a document with no segments."""
        raw = [{"results": [], "channel": 0}]
        doc = _build_document_from_salute_result(raw, language="ru-RU")
        assert len(doc.segments) == 0

    def test_missing_word_alignments(self):
        """A result without ``word_alignments`` still produces a segment."""
        raw = [
            {
                "results": [
                    {
                        "text": "привет",
                        "normalized_text": "Привет.",
                        "start": "0.000s",
                        "end": "0.500s",
                    }
                ],
                "channel": 0,
            }
        ]
        doc = _build_document_from_salute_result(raw, language="ru-RU")
        assert len(doc.segments) == 1
        assert _same_time(doc.segments[0].time.start, 0.0)