init: new structure + fix lint errors

Daniil
2026-02-03 02:15:07 +03:00
commit 67e0f22b4f
89 changed files with 7654 additions and 0 deletions
cpv3/modules/transcription/constants.py (+15)
@@ -0,0 +1,15 @@
FIRST_WORD_IN_DOCUMENT = "first-word-in-document"
FIRST_WORD_IN_SEGMENT = "first-word-in-segment"
FIRST_WORD_IN_LINE = "first-word-in-line"
LAST_WORD_IN_DOCUMENT = "last-word-in-document"
LAST_WORD_IN_SEGMENT = "last-word-in-segment"
LAST_WORD_IN_LINE = "last-word-in-line"
FIRST_LINE_IN_DOCUMENT = "first-line-in-document"
FIRST_LINE_IN_SEGMENT = "first-line-in-segment"
LAST_LINE_IN_DOCUMENT = "last-line-in-document"
LAST_LINE_IN_SEGMENT = "last-line-in-segment"
FIRST_SEGMENT_IN_DOCUMENT = "first-segment-in-document"
LAST_SEGMENT_IN_DOCUMENT = "last-segment-in-document"
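For orientation: positions compose, so the very first word of a document is simultaneously first in its line, its segment, and the document, and the builder in the service module attaches all three word-level tags. A minimal sketch of that expectation:

# Sketch: tags the first word of a document is expected to carry
# (see DocumentBuilder.process_line in the service module below).
EXPECTED_FIRST_WORD_TAGS = {
    FIRST_WORD_IN_DOCUMENT,
    FIRST_WORD_IN_SEGMENT,
    FIRST_WORD_IN_LINE,
}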
cpv3/modules/transcription/models.py (+35)
@@ -0,0 +1,35 @@
from __future__ import annotations
import uuid
from sqlalchemy import JSON, ForeignKey, String
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, mapped_column
from cpv3.db.base import Base, BaseModelMixin
class Transcription(Base, BaseModelMixin):
__tablename__ = "transcriptions"
project_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True),
ForeignKey("projects.id", ondelete="RESTRICT"),
nullable=True,
index=True,
)
source_file_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True), ForeignKey("files.id", ondelete="RESTRICT"), index=True
)
artifact_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True),
ForeignKey("artifact_media_files.id", ondelete="RESTRICT"),
nullable=True,
index=True,
)
engine: Mapped[str] = mapped_column(String(32), default="LOCAL_WHISPER")
language: Mapped[str | None] = mapped_column(String(3), nullable=True)
document: Mapped[dict] = mapped_column(JSON)
transcribe_options: Mapped[dict | None] = mapped_column(JSON, nullable=True)
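A minimal usage sketch, assuming Base and BaseModelMixin supply id, is_active, created_at, and updated_at (the repository below relies on them) and that `session` is an AsyncSession bound to the app engine:

import uuid

async def save_example(session) -> None:
    # Illustration only: source_file_id would reference a real files.id row.
    transcription = Transcription(
        source_file_id=uuid.uuid4(),
        engine="LOCAL_WHISPER",
        language="en",
        document={"segments": []},
    )
    session.add(transcription)
    await session.commit()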
cpv3/modules/transcription/repository.py (+64)
@@ -0,0 +1,64 @@
from __future__ import annotations
import uuid
from sqlalchemy import Select, select
from sqlalchemy.ext.asyncio import AsyncSession
from cpv3.modules.transcription.models import Transcription
from cpv3.modules.transcription.schemas import TranscriptionCreate, TranscriptionUpdate
class TranscriptionRepository:
"""Repository for Transcription database operations."""
def __init__(self, session: AsyncSession) -> None:
self._session = session
async def list_all(self) -> list[Transcription]:
stmt: Select[tuple[Transcription]] = select(Transcription).where(
Transcription.is_active.is_(True)
)
result = await self._session.execute(
stmt.order_by(Transcription.created_at.desc())
)
return list(result.scalars().all())
async def get_by_id(self, transcription_id: uuid.UUID) -> Transcription | None:
result = await self._session.execute(
select(Transcription)
.where(Transcription.id == transcription_id)
.where(Transcription.is_active.is_(True))
)
return result.scalar_one_or_none()
async def create(self, data: TranscriptionCreate) -> Transcription:
transcription = Transcription(
project_id=data.project_id,
source_file_id=data.source_file_id,
artifact_id=data.artifact_id,
engine=data.engine,
language=data.language,
document=data.document,
transcribe_options=data.transcribe_options,
)
self._session.add(transcription)
await self._session.commit()
await self._session.refresh(transcription)
return transcription
async def update(
self, transcription: Transcription, data: TranscriptionUpdate
) -> Transcription:
for key, value in data.model_dump(exclude_unset=True).items():
if value is not None:
setattr(transcription, key, value)
await self._session.commit()
await self._session.refresh(transcription)
return transcription
async def deactivate(self, transcription: Transcription) -> None:
transcription.is_active = False
await self._session.commit()
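A sketch of the intended call pattern; the session factory name is an assumption (something like an async_sessionmaker from cpv3.db.session):

import uuid

async def example(session_factory) -> None:
    # session_factory is assumed to be an async_sessionmaker(AsyncSession).
    async with session_factory() as session:
        repo = TranscriptionRepository(session)
        created = await repo.create(
            TranscriptionCreate(
                source_file_id=uuid.uuid4(),  # placeholder id
                document={"segments": []},
            )
        )
        fetched = await repo.get_by_id(created.id)
        assert fetched is not None
        await repo.deactivate(fetched)  # soft delete: flips is_active to False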
cpv3/modules/transcription/router.py (+129)
@@ -0,0 +1,129 @@
from __future__ import annotations
import uuid
from fastapi import APIRouter, Depends, HTTPException, Response, status
from sqlalchemy.ext.asyncio import AsyncSession
from cpv3.infrastructure.auth import get_current_user
from cpv3.infrastructure.deps import get_storage
from cpv3.infrastructure.storage.base import StorageService
from cpv3.db.session import get_db
from cpv3.modules.transcription.schemas import (
Document,
GoogleSpeechParams,
TranscriptionCreate,
TranscriptionRead,
TranscriptionUpdate,
WhisperParams,
)
from cpv3.modules.transcription.service import (
transcribe_with_google_speech,
transcribe_with_whisper,
)
from cpv3.modules.transcription.repository import TranscriptionRepository
from cpv3.modules.users.models import User
router = APIRouter(prefix="/api/transcribe", tags=["Transcription"])
@router.get("/transcriptions/", response_model=list[TranscriptionRead])
async def list_all_transcriptions(
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> list[TranscriptionRead]:
_ = current_user
repo = TranscriptionRepository(db)
items = await repo.list_all()
return [TranscriptionRead.model_validate(t) for t in items]
@router.post(
"/transcriptions/", response_model=TranscriptionRead, status_code=status.HTTP_201_CREATED
)
async def create_transcription_entry(
body: TranscriptionCreate,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> TranscriptionRead:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.create(body)
return TranscriptionRead.model_validate(transcription)
@router.get("/transcriptions/{transcription_id}/", response_model=TranscriptionRead)
async def retrieve_transcription_entry(
transcription_id: uuid.UUID,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> TranscriptionRead:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.get_by_id(transcription_id)
if transcription is None:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")
return TranscriptionRead.model_validate(transcription)
@router.patch("/transcriptions/{transcription_id}/", response_model=TranscriptionRead)
async def patch_transcription_entry(
transcription_id: uuid.UUID,
body: TranscriptionUpdate,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> TranscriptionRead:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.get_by_id(transcription_id)
if transcription is None:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")
transcription = await repo.update(transcription, body)
return TranscriptionRead.model_validate(transcription)
@router.delete("/transcriptions/{transcription_id}/", status_code=status.HTTP_204_NO_CONTENT)
async def delete_transcription_entry(
transcription_id: uuid.UUID,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> Response:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.get_by_id(transcription_id)
if transcription is None:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")
await repo.deactivate(transcription)
return Response(status_code=status.HTTP_204_NO_CONTENT)
@router.post("/whisper/", response_model=Document)
async def whisper_transcribe(
body: WhisperParams,
current_user: User = Depends(get_current_user),
storage: StorageService = Depends(get_storage),
) -> Document:
_ = current_user
return await transcribe_with_whisper(
storage,
file_key=body.file_path,
model_name=body.model_name,
language=body.language,
)
@router.post("/google-speech/", response_model=Document)
async def google_speech_transcribe(
body: GoogleSpeechParams,
current_user: User = Depends(get_current_user),
storage: StorageService = Depends(get_storage),
) -> Document:
_ = current_user
return await transcribe_with_google_speech(
storage,
file_key=body.file_path,
language_codes=body.language_codes,
)
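A sketch of exercising these endpoints with httpx; the bearer-token header is an assumption, since get_current_user's auth scheme is defined elsewhere:

import asyncio
import httpx

async def demo() -> None:
    headers = {"Authorization": "Bearer <token>"}  # placeholder credentials
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get("/api/transcribe/transcriptions/", headers=headers)
        resp.raise_for_status()
        for item in resp.json():
            print(item["id"], item["engine"])

asyncio.run(demo())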
cpv3/modules/transcription/schemas.py (+146)
@@ -0,0 +1,146 @@
from __future__ import annotations
from datetime import datetime
from typing import Literal
from uuid import UUID
from cpv3.common.schemas import Schema
TranscriptionEngineEnum = Literal["LOCAL_WHISPER", "GOOGLE_SPEECH_CLOUD"]
class TranscriptionRead(Schema):
id: UUID
project_id: UUID | None
source_file_id: UUID
artifact_id: UUID | None
engine: TranscriptionEngineEnum
language: str | None
document: dict
transcribe_options: dict | None
is_active: bool
created_at: datetime
updated_at: datetime
class TranscriptionCreate(Schema):
project_id: UUID | None = None
source_file_id: UUID
artifact_id: UUID | None = None
engine: TranscriptionEngineEnum = "LOCAL_WHISPER"
language: str | None = None
document: dict
transcribe_options: dict | None = None
class TranscriptionUpdate(Schema):
document: dict | None = None
transcribe_options: dict | None = None
# ---------------------------------- Document ----------------------------------
class Tag(Schema):
name: str
class TimeRange(Schema):
start: float
end: float
class WordNode(Schema):
text: str
semantic_tags: list[Tag]
structure_tags: list[Tag]
time: TimeRange
class LineNode(Schema):
text: str
semantic_tags: list[Tag]
structure_tags: list[Tag]
time: TimeRange
words: list[WordNode]
class SegmentNode(Schema):
text: str
semantic_tags: list[Tag]
structure_tags: list[Tag]
time: TimeRange
lines: list[LineNode]
class Document(Schema):
segments: list[SegmentNode]
class WordOptions(Schema):
highlight_words: bool = False
max_line_width: int = 32
max_line_count: int = 2
# ---------------------------------- Whisper Models ----------------------------------
class WhisperWord(Schema):
word: str
start: float
end: float
probability: float
class WhisperSegment(Schema):
id: int
seek: int
start: float
end: float
text: str
tokens: list[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
words: list[WhisperWord]
class WhisperResult(Schema):
text: str
segments: list[WhisperSegment]
language: str
class WhisperParams(Schema):
file_path: str
language: str | None = None
model_name: str = "tiny"
# ---------------------------------- Google Speech Models ----------------------------------
class GoogleSpeechWord(Schema):
word: str
start: float
end: float
class GoogleSpeechSegment(Schema):
text: str
start: float
end: float
words: list[GoogleSpeechWord]
class GoogleSpeechResult(Schema):
text: str
segments: list[GoogleSpeechSegment]
language: str
class GoogleSpeechParams(Schema):
file_path: str
language_codes: list[str] | None = None
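A hand-built example of the nested Document these schemas describe (times illustrative): segments contain lines, lines contain words, and every node carries its own TimeRange plus tag lists:

doc = Document(
    segments=[
        SegmentNode(
            text="hello world",
            semantic_tags=[],
            structure_tags=[],
            time=TimeRange(start=0.0, end=1.1),
            lines=[
                LineNode(
                    text="hello world",
                    semantic_tags=[],
                    structure_tags=[],
                    time=TimeRange(start=0.0, end=1.1),
                    words=[
                        WordNode(text="hello", semantic_tags=[], structure_tags=[],
                                 time=TimeRange(start=0.0, end=0.4)),
                        WordNode(text="world", semantic_tags=[], structure_tags=[],
                                 time=TimeRange(start=0.5, end=1.1)),
                    ],
                )
            ],
        )
    ]
)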
cpv3/modules/transcription/service.py (+402)
@@ -0,0 +1,402 @@
from __future__ import annotations
import asyncio
import os
from tempfile import NamedTemporaryFile
from typing import Callable, cast
import anyio
from cpv3.infrastructure.settings import get_settings
from cpv3.infrastructure.storage.base import StorageService
from cpv3.modules.transcription.constants import (
FIRST_LINE_IN_DOCUMENT,
FIRST_LINE_IN_SEGMENT,
FIRST_SEGMENT_IN_DOCUMENT,
FIRST_WORD_IN_DOCUMENT,
FIRST_WORD_IN_LINE,
FIRST_WORD_IN_SEGMENT,
LAST_LINE_IN_DOCUMENT,
LAST_LINE_IN_SEGMENT,
LAST_SEGMENT_IN_DOCUMENT,
LAST_WORD_IN_DOCUMENT,
LAST_WORD_IN_LINE,
LAST_WORD_IN_SEGMENT,
)
from cpv3.modules.transcription.schemas import (
Document,
GoogleSpeechResult,
GoogleSpeechSegment,
GoogleSpeechWord,
LineNode,
SegmentNode,
Tag,
TimeRange,
WhisperResult,
WhisperSegment,
WhisperWord,
WordNode,
WordOptions,
)
class DocumentBuilder:
def compute_segment_lines(
self, segment: WhisperSegment | GoogleSpeechSegment, max_chars_per_line: int
) -> list[LineNode]:
words = segment.words or []
lines: list[list[WhisperWord | GoogleSpeechWord]] = []
cur_line: list[WhisperWord | GoogleSpeechWord] = []
cur_len = 0
for w in words:
text = (w.word or "").strip()
if not text:
continue
extra = len(text) + (1 if cur_line else 0)
if cur_line and cur_len + extra > max_chars_per_line:
lines.append(cur_line)
cur_line, cur_len = [w], len(text)
else:
cur_line.append(w)
cur_len += extra
if cur_line:
lines.append(cur_line)
result_lines: list[LineNode] = []
for rline in lines:
time = TimeRange(start=rline[0].start, end=rline[-1].end)
word_nodes = [
WordNode(
text=(rword.word or "").strip(),
time=TimeRange(start=rword.start, end=rword.end),
semantic_tags=[],
structure_tags=[],
)
for rword in rline
]
line_node = LineNode(
text=" ".join((rword.word or "") for rword in rline).strip(),
semantic_tags=[],
structure_tags=[],
time=time,
words=word_nodes,
)
result_lines.append(line_node)
return result_lines
def process_line(
self,
line: LineNode,
is_first_line_in_document: bool,
is_last_line_in_document: bool,
is_first_line_in_segment: bool,
is_last_line_in_segment: bool,
) -> list[WordNode]:
words: list[WordNode] = []
for idx, word in enumerate(line.words):
is_first = idx == 0
is_last = idx == len(line.words) - 1
rules = [
(is_first_line_in_document and is_first, FIRST_WORD_IN_DOCUMENT),
(is_last_line_in_document and is_last, LAST_WORD_IN_DOCUMENT),
(is_first_line_in_segment and is_first, FIRST_WORD_IN_SEGMENT),
(is_last_line_in_segment and is_last, LAST_WORD_IN_SEGMENT),
(is_first, FIRST_WORD_IN_LINE),
(is_last, LAST_WORD_IN_LINE),
]
structure_tags = [
Tag(name=tag_name) for condition, tag_name in rules if condition
]
new_word = word.model_copy(update={"structure_tags": structure_tags})
words.append(new_word)
return words
def process_segment(
self,
segment: SegmentNode,
is_first_segment_in_document: bool,
is_last_segment_in_document: bool,
) -> list[LineNode]:
lines: list[LineNode] = []
for idx, line in enumerate(segment.lines):
is_first = idx == 0
is_last = idx == len(segment.lines) - 1
rules = [
(is_first_segment_in_document and is_first, FIRST_LINE_IN_DOCUMENT),
(is_last_segment_in_document and is_last, LAST_LINE_IN_DOCUMENT),
(is_first, FIRST_LINE_IN_SEGMENT),
(is_last, LAST_LINE_IN_SEGMENT),
]
structure_tags = [
Tag(name=tag_name) for condition, tag_name in rules if condition
]
words = self.process_line(
line,
is_first_line_in_document=is_first_segment_in_document and is_first,
is_last_line_in_document=is_last_segment_in_document and is_last,
is_first_line_in_segment=is_first,
is_last_line_in_segment=is_last,
)
new_line = line.model_copy(
update={"structure_tags": structure_tags, "words": words}
)
lines.append(new_line)
return lines
def process_document(self, document: Document) -> Document:
segments: list[SegmentNode] = []
for idx, segment in enumerate(document.segments):
structure_tags: list[Tag] = []
is_first_segment_in_document = idx == 0
is_last_segment_in_document = idx == len(document.segments) - 1
if is_first_segment_in_document:
structure_tags.append(Tag(name=FIRST_SEGMENT_IN_DOCUMENT))
if is_last_segment_in_document:
structure_tags.append(Tag(name=LAST_SEGMENT_IN_DOCUMENT))
lines = self.process_segment(
segment, is_first_segment_in_document, is_last_segment_in_document
)
new_segment = segment.model_copy(
update={"lines": lines, "structure_tags": structure_tags}
)
segments.append(new_segment)
return Document(segments=segments)
async def _convert_local_to_ogg(input_path: str) -> tuple[str, Callable[[], None]]:
with NamedTemporaryFile(suffix=".ogg", delete=False) as out:
out_path = out.name
proc = await asyncio.create_subprocess_exec(
"ffmpeg",
"-y",
"-i",
input_path,
"-c:a",
"libopus",
"-b:a",
"24k",
"-vn",
"-ac",
"1",
"-ar",
"16000",
out_path,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
_, stderr = await proc.communicate()
if proc.returncode != 0:
raise RuntimeError(f"ffmpeg failed: {stderr.decode(errors='ignore')}")
    def _cleanup() -> None:
        if os.path.exists(out_path):
            os.remove(out_path)
return out_path, _cleanup
def _make_document_from_segments(
builder: DocumentBuilder,
segments: list[WhisperSegment] | list[GoogleSpeechSegment],
*,
max_line_width: int,
) -> Document:
result_segments: list[SegmentNode] = []
for segment in segments:
lines = builder.compute_segment_lines(segment, max_line_width)
time = TimeRange(start=segment.start, end=segment.end)
segment_node = SegmentNode(
text=segment.text.strip(),
semantic_tags=[],
structure_tags=[],
time=time,
lines=lines,
)
result_segments.append(segment_node)
return Document(segments=result_segments)
def _whisper_transcribe_sync(
*,
local_file_path: str,
model_name: str,
language: str | None,
) -> Document:
import whisper # type: ignore[import-untyped]
settings = get_settings()
settings.transcription_models_dir.mkdir(parents=True, exist_ok=True)
builder = DocumentBuilder()
model = whisper.load_model(
model_name, download_root=str(settings.transcription_models_dir)
)
if language is None:
audio = whisper.load_audio(local_file_path)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(
model.device
)
_, probs_raw = model.detect_language(mel)
probs = cast(dict[str, float], probs_raw)
language = max(probs, key=lambda k: probs[k])
result = whisper.transcribe(
audio=whisper.load_audio(local_file_path),
model=model,
word_timestamps=True,
temperature=0.2,
language=language,
verbose=False,
)
parsed = WhisperResult.model_validate(result)
words_options = WordOptions(
highlight_words=True,
max_line_width=32,
max_line_count=2,
)
document = _make_document_from_segments(
builder, parsed.segments, max_line_width=words_options.max_line_width
)
return builder.process_document(document)
async def transcribe_with_whisper(
storage: StorageService,
*,
file_key: str,
model_name: str = "tiny",
language: str | None = None,
) -> Document:
tmp = await storage.download_to_temp(file_key)
try:
return await anyio.to_thread.run_sync(
lambda: _whisper_transcribe_sync(
local_file_path=tmp.path,
model_name=model_name,
language=language,
)
)
finally:
tmp.cleanup()
def _google_transcribe_sync(
*, ogg_bytes: bytes, language_codes: list[str]
) -> GoogleSpeechResult:
from google.cloud import speech
settings = get_settings()
client: speech.SpeechClient = speech.SpeechClient.from_service_account_file(
str(settings.google_service_key_path)
)
audio = speech.RecognitionAudio(content=ogg_bytes)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
sample_rate_hertz=16000,
language_code=language_codes[0],
alternative_language_codes=(
language_codes[1:] if len(language_codes) > 1 else []
),
model="latest_long",
enable_word_time_offsets=True,
)
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=600)
segments: list[GoogleSpeechSegment] = []
full_text = ""
for result in response.results:
alternative = result.alternatives[0]
words: list[GoogleSpeechWord] = []
for word_info in alternative.words:
words.append(
GoogleSpeechWord(
word=word_info.word,
start=word_info.start_time.total_seconds(),
end=word_info.end_time.total_seconds(),
)
)
if words:
segment_text = alternative.transcript
full_text += segment_text + " "
segments.append(
GoogleSpeechSegment(
text=segment_text,
start=words[0].start,
end=words[-1].end,
words=words,
)
)
return GoogleSpeechResult(
text=full_text.strip(), segments=segments, language=language_codes[0]
)
async def transcribe_with_google_speech(
storage: StorageService,
*,
file_key: str,
language_codes: list[str] | None = None,
) -> Document:
language_codes = language_codes or ["ru-RU", "en-US"]
builder = DocumentBuilder()
words_options = WordOptions()
input_tmp = await storage.download_to_temp(file_key)
try:
ogg_path, ogg_cleanup = await _convert_local_to_ogg(input_tmp.path)
try:
with open(ogg_path, "rb") as f:
content = f.read()
result = await anyio.to_thread.run_sync(
lambda: _google_transcribe_sync(
ogg_bytes=content, language_codes=language_codes
)
)
document = _make_document_from_segments(
builder, result.segments, max_line_width=words_options.max_line_width
)
return builder.process_document(document)
finally:
ogg_cleanup()
finally:
input_tmp.cleanup()
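A sketch of end-to-end usage; `storage` is assumed to implement download_to_temp(key) returning an object with .path and .cleanup(), as the functions above rely on, and the file key is a placeholder:

async def demo(storage: StorageService) -> None:
    document = await transcribe_with_whisper(
        storage,
        file_key="uploads/meeting.wav",  # hypothetical object key
        model_name="tiny",
        language=None,  # None triggers Whisper language detection
    )
    first_segment = document.segments[0]
    print(first_segment.text, [tag.name for tag in first_segment.structure_tags])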