147 lines
2.8 KiB
Python
147 lines
2.8 KiB
Python
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from typing import Literal
|
|
from uuid import UUID
|
|
|
|
from cpv3.common.schemas import Schema
|
|
|
|
|
|
# Closed set of engine identifiers accepted by the ``engine`` fields of the
# transcription schemas declared in this module.
TranscriptionEngineEnum = Literal[
    "LOCAL_WHISPER",
    "GOOGLE_SPEECH_CLOUD",
]
|
|
|
|
|
|
class TranscriptionRead(Schema):
    """Complete field set of a transcription record.

    No field declares a default. ``project_id``, ``artifact_id``,
    ``language`` and ``transcribe_options`` additionally accept ``None``.
    NOTE(review): the name suggests this is the read/response shape --
    confirm against the callers.
    """

    # --- identifiers ---
    id: UUID
    project_id: UUID | None
    source_file_id: UUID
    artifact_id: UUID | None

    # --- engine selection ---
    engine: TranscriptionEngineEnum
    language: str | None

    # --- payload (plain dicts, no nested schema enforced here) ---
    document: dict
    transcribe_options: dict | None

    # --- record state and timestamps ---
    is_active: bool
    created_at: datetime
    updated_at: datetime
|
|
|
|
|
|
class TranscriptionCreate(Schema):
    """Fields supplied when creating a transcription.

    ``source_file_id`` and ``document`` declare no default; every other
    field has one (``None``, or ``"LOCAL_WHISPER"`` for ``engine``).
    """

    # --- identifiers ---
    project_id: UUID | None = None
    source_file_id: UUID
    artifact_id: UUID | None = None

    # --- engine selection ---
    engine: TranscriptionEngineEnum = "LOCAL_WHISPER"
    language: str | None = None

    # --- payload (plain dicts, no nested schema enforced here) ---
    document: dict
    transcribe_options: dict | None = None
|
|
|
|
|
|
class TranscriptionUpdate(Schema):
    """Fields that may be changed on an existing transcription.

    Both fields default to ``None``.
    NOTE(review): ``None`` presumably means "leave unchanged" -- confirm
    against the update handler.
    """

    document: dict | None = None
    transcribe_options: dict | None = None
|
|
|
|
|
|
# ---------------------------------- Document ----------------------------------
|
|
class Tag(Schema):
    """A named tag; used by the ``semantic_tags`` / ``structure_tags`` lists of the node schemas."""

    name: str
|
|
|
|
|
|
class TimeRange(Schema):
    """A ``start``/``end`` pair of floats.

    NOTE(review): units are presumably seconds -- confirm. No ordering
    constraint between ``start`` and ``end`` is declared here.
    """

    start: float
    end: float
|
|
|
|
|
|
class WordNode(Schema):
    """Leaf node of the document tree: text, tags and a time range, with no children."""

    text: str

    # Two independent tag lists, both of ``Tag`` items.
    semantic_tags: list[Tag]
    structure_tags: list[Tag]

    time: TimeRange
|
|
|
|
|
|
class LineNode(Schema):
    """Middle node of the document tree: same fields as ``WordNode`` plus child ``words``."""

    text: str

    # Two independent tag lists, both of ``Tag`` items.
    semantic_tags: list[Tag]
    structure_tags: list[Tag]

    time: TimeRange

    # Child nodes.
    words: list[WordNode]
|
|
|
|
|
|
class SegmentNode(Schema):
    """Top-level node of the document tree: text, tags, a time range and child ``lines``."""

    text: str

    # Two independent tag lists, both of ``Tag`` items.
    semantic_tags: list[Tag]
    structure_tags: list[Tag]

    time: TimeRange

    # Child nodes.
    lines: list[LineNode]
|
|
|
|
|
|
class Document(Schema):
    """Root of the document tree: a list of ``SegmentNode`` items."""

    segments: list[SegmentNode]
|
|
|
|
|
|
class WordOptions(Schema):
    """Option set with defaults for word highlighting and line limits.

    NOTE(review): ``max_line_width`` is presumably measured in characters
    and ``max_line_count`` in lines per cue -- confirm against the consumer.
    """

    highlight_words: bool = False

    # Line-shape limits.
    max_line_width: int = 32
    max_line_count: int = 2
|
|
|
|
|
|
# ---------------------------------- Whisper Models ----------------------------------
|
|
class WhisperWord(Schema):
    """One word entry of a ``WhisperSegment``: text, timing and probability."""

    word: str

    # Timing -- presumably seconds (TODO confirm).
    start: float
    end: float

    probability: float
|
|
|
|
|
|
class WhisperSegment(Schema):
    """One segment entry of a ``WhisperResult``.

    NOTE(review): the field names look like the per-segment keys produced
    by the Whisper library -- confirm against the producer of this data.
    """

    # --- position ---
    id: int
    seek: int
    start: float
    end: float

    # --- content ---
    text: str
    tokens: list[int]

    # --- decoding statistics ---
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float

    # --- word-level breakdown ---
    words: list[WhisperWord]
|
|
|
|
|
|
class WhisperResult(Schema):
    """Whisper result: overall ``text``, its ``WhisperSegment`` list and a ``language`` string."""

    text: str
    segments: list[WhisperSegment]
    language: str
|
|
|
|
|
|
class WhisperParams(Schema):
    """Parameters for a Whisper run.

    Only ``file_path`` has no default. ``language`` defaults to ``None``
    and ``model_name`` to ``"tiny"``.
    """

    file_path: str

    # Optional tuning.
    language: str | None = None
    model_name: str = "tiny"
|
|
|
|
|
|
# ---------------------------------- Google Speech Models ----------------------------------
|
|
class GoogleSpeechWord(Schema):
    """One word entry of a ``GoogleSpeechSegment``: text and timing."""

    word: str

    # Timing -- presumably seconds (TODO confirm).
    start: float
    end: float
|
|
|
|
|
|
class GoogleSpeechSegment(Schema):
    """One segment entry of a ``GoogleSpeechResult``: text, timing and its words."""

    text: str

    # Timing -- presumably seconds (TODO confirm).
    start: float
    end: float

    # Word-level breakdown.
    words: list[GoogleSpeechWord]
|
|
|
|
|
|
class GoogleSpeechResult(Schema):
    """Google Speech result: overall ``text``, its ``GoogleSpeechSegment`` list and a ``language`` string."""

    text: str
    segments: list[GoogleSpeechSegment]
    language: str
|
|
|
|
|
|
class GoogleSpeechParams(Schema):
    """Parameters for a Google Speech run.

    ``file_path`` has no default; ``language_codes`` defaults to ``None``.
    """

    file_path: str

    # Optional list of language code strings.
    language_codes: list[str] | None = None
|