main_backend/cpv3/modules/transcription/schemas.py

from __future__ import annotations

from datetime import datetime
from typing import Literal
from uuid import UUID

from cpv3.common.schemas import Schema


TranscriptionEngineEnum = Literal["LOCAL_WHISPER", "GOOGLE_SPEECH_CLOUD"]


class TranscriptionRead(Schema):
    id: UUID
    project_id: UUID | None
    source_file_id: UUID
    artifact_id: UUID | None

    engine: TranscriptionEngineEnum
    language: str | None

    document: dict
    transcribe_options: dict | None

    is_active: bool
    created_at: datetime
    updated_at: datetime


class TranscriptionCreate(Schema):
    project_id: UUID | None = None
    source_file_id: UUID
    artifact_id: UUID | None = None

    engine: TranscriptionEngineEnum = "LOCAL_WHISPER"
    language: str | None = None

    document: dict
    transcribe_options: dict | None = None


class TranscriptionUpdate(Schema):
    document: dict | None = None
    transcribe_options: dict | None = None


# ---------------------------------- Document ----------------------------------
class Tag(Schema):
    name: str


class TimeRange(Schema):
    start: float
    end: float


class WordNode(Schema):
    text: str
    semantic_tags: list[Tag]
    structure_tags: list[Tag]
    time: TimeRange


class LineNode(Schema):
    text: str
    semantic_tags: list[Tag]
    structure_tags: list[Tag]
    time: TimeRange
    words: list[WordNode]


class SegmentNode(Schema):
    text: str
    semantic_tags: list[Tag]
    structure_tags: list[Tag]
    time: TimeRange
    lines: list[LineNode]


class Document(Schema):
    segments: list[SegmentNode]


class WordOptions(Schema):
    highlight_words: bool = False
    max_line_width: int = 32
    max_line_count: int = 2


# ---------------------------------- Whisper Models ----------------------------------
class WhisperWord(Schema):
    word: str
    start: float
    end: float
    probability: float


class WhisperSegment(Schema):
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
    words: list[WhisperWord]


class WhisperResult(Schema):
    text: str
    segments: list[WhisperSegment]
    language: str


class WhisperParams(Schema):
    file_path: str
    language: str | None = None
    model_name: str = "tiny"


# ---------------------------------- Google Speech Models ----------------------------------
class GoogleSpeechWord(Schema):
    word: str
    start: float
    end: float


class GoogleSpeechSegment(Schema):
    text: str
    start: float
    end: float
    words: list[GoogleSpeechWord]


class GoogleSpeechResult(Schema):
    text: str
    segments: list[GoogleSpeechSegment]
    language: str


class GoogleSpeechParams(Schema):
    file_path: str
    language_codes: list[str] | None = None