init: new structure + fix lint errors

Daniil
2026-02-03 02:15:07 +03:00
commit 67e0f22b4f
89 changed files with 7654 additions and 0 deletions
cpv3/modules/transcription/constants.py (+15)
@@ -0,0 +1,15 @@
FIRST_WORD_IN_DOCUMENT = "first-word-in-document"
FIRST_WORD_IN_SEGMENT = "first-word-in-segment"
FIRST_WORD_IN_LINE = "first-word-in-line"
LAST_WORD_IN_DOCUMENT = "last-word-in-document"
LAST_WORD_IN_SEGMENT = "last-word-in-segment"
LAST_WORD_IN_LINE = "last-word-in-line"
FIRST_LINE_IN_DOCUMENT = "first-line-in-document"
FIRST_LINE_IN_SEGMENT = "first-line-in-segment"
LAST_LINE_IN_DOCUMENT = "last-line-in-document"
LAST_LINE_IN_SEGMENT = "last-line-in-segment"
FIRST_SEGMENT_IN_DOCUMENT = "first-segment-in-document"
LAST_SEGMENT_IN_DOCUMENT = "last-segment-in-document"
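For orientation: positions compose, so the very first word of a document is simultaneously first in its line, its segment, and the document, and the builder in the service module attaches all three word-level tags. A minimal sketch of that expectation:

# Sketch: tags the first word of a document is expected to carry
# (see DocumentBuilder.process_line in the service module below).
EXPECTED_FIRST_WORD_TAGS = {
    FIRST_WORD_IN_DOCUMENT,
    FIRST_WORD_IN_SEGMENT,
    FIRST_WORD_IN_LINE,
}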
cpv3/modules/transcription/models.py (+35)
@@ -0,0 +1,35 @@
from __future__ import annotations
import uuid
from sqlalchemy import JSON, ForeignKey, String
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, mapped_column
from cpv3.db.base import Base, BaseModelMixin
class Transcription(Base, BaseModelMixin):
__tablename__ = "transcriptions"
project_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True),
ForeignKey("projects.id", ondelete="RESTRICT"),
nullable=True,
index=True,
)
source_file_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True), ForeignKey("files.id", ondelete="RESTRICT"), index=True
)
artifact_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True),
ForeignKey("artifact_media_files.id", ondelete="RESTRICT"),
nullable=True,
index=True,
)
engine: Mapped[str] = mapped_column(String(32), default="LOCAL_WHISPER")
language: Mapped[str | None] = mapped_column(String(3), nullable=True)
document: Mapped[dict] = mapped_column(JSON)
transcribe_options: Mapped[dict | None] = mapped_column(JSON, nullable=True)
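A minimal usage sketch, assuming Base and BaseModelMixin supply id, is_active, created_at, and updated_at (the repository below relies on them) and that `session` is an AsyncSession bound to the app engine:

import uuid

async def save_example(session) -> None:
    # Illustration only: source_file_id would reference a real files.id row.
    transcription = Transcription(
        source_file_id=uuid.uuid4(),
        engine="LOCAL_WHISPER",
        language="en",
        document={"segments": []},
    )
    session.add(transcription)
    await session.commit()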
cpv3/modules/transcription/repository.py (+64)
@@ -0,0 +1,64 @@
from __future__ import annotations
import uuid
from sqlalchemy import Select, select
from sqlalchemy.ext.asyncio import AsyncSession
from cpv3.modules.transcription.models import Transcription
from cpv3.modules.transcription.schemas import TranscriptionCreate, TranscriptionUpdate
class TranscriptionRepository:
"""Repository for Transcription database operations."""
def __init__(self, session: AsyncSession) -> None:
self._session = session
async def list_all(self) -> list[Transcription]:
stmt: Select[tuple[Transcription]] = select(Transcription).where(
Transcription.is_active.is_(True)
)
result = await self._session.execute(
stmt.order_by(Transcription.created_at.desc())
)
return list(result.scalars().all())
async def get_by_id(self, transcription_id: uuid.UUID) -> Transcription | None:
result = await self._session.execute(
select(Transcription)
.where(Transcription.id == transcription_id)
.where(Transcription.is_active.is_(True))
)
return result.scalar_one_or_none()
async def create(self, data: TranscriptionCreate) -> Transcription:
transcription = Transcription(
project_id=data.project_id,
source_file_id=data.source_file_id,
artifact_id=data.artifact_id,
engine=data.engine,
language=data.language,
document=data.document,
transcribe_options=data.transcribe_options,
)
self._session.add(transcription)
await self._session.commit()
await self._session.refresh(transcription)
return transcription
async def update(
self, transcription: Transcription, data: TranscriptionUpdate
) -> Transcription:
for key, value in data.model_dump(exclude_unset=True).items():
if value is not None:
setattr(transcription, key, value)
await self._session.commit()
await self._session.refresh(transcription)
return transcription
async def deactivate(self, transcription: Transcription) -> None:
transcription.is_active = False
await self._session.commit()
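A sketch of the intended call pattern; the session factory name is an assumption (something like an async_sessionmaker from cpv3.db.session):

import uuid

async def example(session_factory) -> None:
    # session_factory is assumed to be an async_sessionmaker(AsyncSession).
    async with session_factory() as session:
        repo = TranscriptionRepository(session)
        created = await repo.create(
            TranscriptionCreate(
                source_file_id=uuid.uuid4(),  # placeholder id
                document={"segments": []},
            )
        )
        fetched = await repo.get_by_id(created.id)
        assert fetched is not None
        await repo.deactivate(fetched)  # soft delete: flips is_active to False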
cpv3/modules/transcription/router.py (+129)
@@ -0,0 +1,129 @@
from __future__ import annotations
import uuid
from fastapi import APIRouter, Depends, HTTPException, Response, status
from sqlalchemy.ext.asyncio import AsyncSession
from cpv3.infrastructure.auth import get_current_user
from cpv3.infrastructure.deps import get_storage
from cpv3.infrastructure.storage.base import StorageService
from cpv3.db.session import get_db
from cpv3.modules.transcription.schemas import (
Document,
GoogleSpeechParams,
TranscriptionCreate,
TranscriptionRead,
TranscriptionUpdate,
WhisperParams,
)
from cpv3.modules.transcription.service import (
transcribe_with_google_speech,
transcribe_with_whisper,
)
from cpv3.modules.transcription.repository import TranscriptionRepository
from cpv3.modules.users.models import User
router = APIRouter(prefix="/api/transcribe", tags=["Transcription"])
@router.get("/transcriptions/", response_model=list[TranscriptionRead])
async def list_all_transcriptions(
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> list[TranscriptionRead]:
_ = current_user
repo = TranscriptionRepository(db)
items = await repo.list_all()
return [TranscriptionRead.model_validate(t) for t in items]
@router.post(
"/transcriptions/", response_model=TranscriptionRead, status_code=status.HTTP_201_CREATED
)
async def create_transcription_entry(
body: TranscriptionCreate,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> TranscriptionRead:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.create(body)
return TranscriptionRead.model_validate(transcription)
@router.get("/transcriptions/{transcription_id}/", response_model=TranscriptionRead)
async def retrieve_transcription_entry(
transcription_id: uuid.UUID,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> TranscriptionRead:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.get_by_id(transcription_id)
if transcription is None:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")
return TranscriptionRead.model_validate(transcription)
@router.patch("/transcriptions/{transcription_id}/", response_model=TranscriptionRead)
async def patch_transcription_entry(
transcription_id: uuid.UUID,
body: TranscriptionUpdate,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> TranscriptionRead:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.get_by_id(transcription_id)
if transcription is None:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")
transcription = await repo.update(transcription, body)
return TranscriptionRead.model_validate(transcription)
@router.delete("/transcriptions/{transcription_id}/", status_code=status.HTTP_204_NO_CONTENT)
async def delete_transcription_entry(
transcription_id: uuid.UUID,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> Response:
_ = current_user
repo = TranscriptionRepository(db)
transcription = await repo.get_by_id(transcription_id)
if transcription is None:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")
await repo.deactivate(transcription)
return Response(status_code=status.HTTP_204_NO_CONTENT)
@router.post("/whisper/", response_model=Document)
async def whisper_transcribe(
body: WhisperParams,
current_user: User = Depends(get_current_user),
storage: StorageService = Depends(get_storage),
) -> Document:
_ = current_user
return await transcribe_with_whisper(
storage,
file_key=body.file_path,
model_name=body.model_name,
language=body.language,
)
@router.post("/google-speech/", response_model=Document)
async def google_speech_transcribe(
body: GoogleSpeechParams,
current_user: User = Depends(get_current_user),
storage: StorageService = Depends(get_storage),
) -> Document:
_ = current_user
return await transcribe_with_google_speech(
storage,
file_key=body.file_path,
language_codes=body.language_codes,
)
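A sketch of exercising these endpoints with httpx; the bearer-token header is an assumption, since get_current_user's auth scheme is defined elsewhere:

import asyncio
import httpx

async def demo() -> None:
    headers = {"Authorization": "Bearer <token>"}  # placeholder credentials
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get("/api/transcribe/transcriptions/", headers=headers)
        resp.raise_for_status()
        for item in resp.json():
            print(item["id"], item["engine"])

asyncio.run(demo())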
cpv3/modules/transcription/schemas.py (+146)
@@ -0,0 +1,146 @@
from __future__ import annotations
from datetime import datetime
from typing import Literal
from uuid import UUID
from cpv3.common.schemas import Schema
TranscriptionEngineEnum = Literal["LOCAL_WHISPER", "GOOGLE_SPEECH_CLOUD"]
class TranscriptionRead(Schema):
id: UUID
project_id: UUID | None
source_file_id: UUID
artifact_id: UUID | None
engine: TranscriptionEngineEnum
language: str | None
document: dict
transcribe_options: dict | None
is_active: bool
created_at: datetime
updated_at: datetime
class TranscriptionCreate(Schema):
project_id: UUID | None = None
source_file_id: UUID
artifact_id: UUID | None = None
engine: TranscriptionEngineEnum = "LOCAL_WHISPER"
language: str | None = None
document: dict
transcribe_options: dict | None = None
class TranscriptionUpdate(Schema):
document: dict | None = None
transcribe_options: dict | None = None
# ---------------------------------- Document ----------------------------------
class Tag(Schema):
name: str
class TimeRange(Schema):
start: float
end: float
class WordNode(Schema):
text: str
semantic_tags: list[Tag]
structure_tags: list[Tag]
time: TimeRange
class LineNode(Schema):
text: str
semantic_tags: list[Tag]
structure_tags: list[Tag]
time: TimeRange
words: list[WordNode]
class SegmentNode(Schema):
text: str
semantic_tags: list[Tag]
structure_tags: list[Tag]
time: TimeRange
lines: list[LineNode]
class Document(Schema):
segments: list[SegmentNode]
class WordOptions(Schema):
highlight_words: bool = False
max_line_width: int = 32
max_line_count: int = 2
# ---------------------------------- Whisper Models ----------------------------------
class WhisperWord(Schema):
word: str
start: float
end: float
probability: float
class WhisperSegment(Schema):
id: int
seek: int
start: float
end: float
text: str
tokens: list[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
words: list[WhisperWord]
class WhisperResult(Schema):
text: str
segments: list[WhisperSegment]
language: str
class WhisperParams(Schema):
file_path: str
language: str | None = None
model_name: str = "tiny"
# ---------------------------------- Google Speech Models ----------------------------------
class GoogleSpeechWord(Schema):
word: str
start: float
end: float
class GoogleSpeechSegment(Schema):
text: str
start: float
end: float
words: list[GoogleSpeechWord]
class GoogleSpeechResult(Schema):
text: str
segments: list[GoogleSpeechSegment]
language: str
class GoogleSpeechParams(Schema):
file_path: str
language_codes: list[str] | None = None
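A hand-built example of the nested Document these schemas describe (times illustrative): segments contain lines, lines contain words, and every node carries its own TimeRange plus tag lists:

doc = Document(
    segments=[
        SegmentNode(
            text="hello world",
            semantic_tags=[],
            structure_tags=[],
            time=TimeRange(start=0.0, end=1.1),
            lines=[
                LineNode(
                    text="hello world",
                    semantic_tags=[],
                    structure_tags=[],
                    time=TimeRange(start=0.0, end=1.1),
                    words=[
                        WordNode(text="hello", semantic_tags=[], structure_tags=[],
                                 time=TimeRange(start=0.0, end=0.4)),
                        WordNode(text="world", semantic_tags=[], structure_tags=[],
                                 time=TimeRange(start=0.5, end=1.1)),
                    ],
                )
            ],
        )
    ]
)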
cpv3/modules/transcription/service.py (+402)
@@ -0,0 +1,402 @@
from __future__ import annotations
import asyncio
import os
from tempfile import NamedTemporaryFile
from typing import Callable, cast
import anyio
from cpv3.infrastructure.settings import get_settings
from cpv3.infrastructure.storage.base import StorageService
from cpv3.modules.transcription.constants import (
FIRST_LINE_IN_DOCUMENT,
FIRST_LINE_IN_SEGMENT,
FIRST_SEGMENT_IN_DOCUMENT,
FIRST_WORD_IN_DOCUMENT,
FIRST_WORD_IN_LINE,
FIRST_WORD_IN_SEGMENT,
LAST_LINE_IN_DOCUMENT,
LAST_LINE_IN_SEGMENT,
LAST_SEGMENT_IN_DOCUMENT,
LAST_WORD_IN_DOCUMENT,
LAST_WORD_IN_LINE,
LAST_WORD_IN_SEGMENT,
)
from cpv3.modules.transcription.schemas import (
Document,
GoogleSpeechResult,
GoogleSpeechSegment,
GoogleSpeechWord,
LineNode,
SegmentNode,
Tag,
TimeRange,
WhisperResult,
WhisperSegment,
WhisperWord,
WordNode,
WordOptions,
)
class DocumentBuilder:
def compute_segment_lines(
self, segment: WhisperSegment | GoogleSpeechSegment, max_chars_per_line: int
) -> list[LineNode]:
words = segment.words or []
lines: list[list[WhisperWord | GoogleSpeechWord]] = []
cur_line: list[WhisperWord | GoogleSpeechWord] = []
cur_len = 0
for w in words:
text = (w.word or "").strip()
if not text:
continue
extra = len(text) + (1 if cur_line else 0)
if cur_line and cur_len + extra > max_chars_per_line:
lines.append(cur_line)
cur_line, cur_len = [w], len(text)
else:
cur_line.append(w)
cur_len += extra
if cur_line:
lines.append(cur_line)
result_lines: list[LineNode] = []
for rline in lines:
time = TimeRange(start=rline[0].start, end=rline[-1].end)
word_nodes = [
WordNode(
text=(rword.word or "").strip(),
time=TimeRange(start=rword.start, end=rword.end),
semantic_tags=[],
structure_tags=[],
)
for rword in rline
]
line_node = LineNode(
text=" ".join((rword.word or "") for rword in rline).strip(),
semantic_tags=[],
structure_tags=[],
time=time,
words=word_nodes,
)
result_lines.append(line_node)
return result_lines
def process_line(
self,
line: LineNode,
is_first_line_in_document: bool,
is_last_line_in_document: bool,
is_first_line_in_segment: bool,
is_last_line_in_segment: bool,
) -> list[WordNode]:
words: list[WordNode] = []
for idx, word in enumerate(line.words):
is_first = idx == 0
is_last = idx == len(line.words) - 1
rules = [
(is_first_line_in_document and is_first, FIRST_WORD_IN_DOCUMENT),
(is_last_line_in_document and is_last, LAST_WORD_IN_DOCUMENT),
(is_first_line_in_segment and is_first, FIRST_WORD_IN_SEGMENT),
(is_last_line_in_segment and is_last, LAST_WORD_IN_SEGMENT),
(is_first, FIRST_WORD_IN_LINE),
(is_last, LAST_WORD_IN_LINE),
]
structure_tags = [
Tag(name=tag_name) for condition, tag_name in rules if condition
]
new_word = word.model_copy(update={"structure_tags": structure_tags})
words.append(new_word)
return words
def process_segment(
self,
segment: SegmentNode,
is_first_segment_in_document: bool,
is_last_segment_in_document: bool,
) -> list[LineNode]:
lines: list[LineNode] = []
for idx, line in enumerate(segment.lines):
is_first = idx == 0
is_last = idx == len(segment.lines) - 1
rules = [
(is_first_segment_in_document and is_first, FIRST_LINE_IN_DOCUMENT),
(is_last_segment_in_document and is_last, LAST_LINE_IN_DOCUMENT),
(is_first, FIRST_LINE_IN_SEGMENT),
(is_last, LAST_LINE_IN_SEGMENT),
]
structure_tags = [
Tag(name=tag_name) for condition, tag_name in rules if condition
]
words = self.process_line(
line,
is_first_line_in_document=is_first_segment_in_document and is_first,
is_last_line_in_document=is_last_segment_in_document and is_last,
is_first_line_in_segment=is_first,
is_last_line_in_segment=is_last,
)
new_line = line.model_copy(
update={"structure_tags": structure_tags, "words": words}
)
lines.append(new_line)
return lines
def process_document(self, document: Document) -> Document:
segments: list[SegmentNode] = []
for idx, segment in enumerate(document.segments):
structure_tags: list[Tag] = []
is_first_segment_in_document = idx == 0
is_last_segment_in_document = idx == len(document.segments) - 1
if is_first_segment_in_document:
structure_tags.append(Tag(name=FIRST_SEGMENT_IN_DOCUMENT))
if is_last_segment_in_document:
structure_tags.append(Tag(name=LAST_SEGMENT_IN_DOCUMENT))
lines = self.process_segment(
segment, is_first_segment_in_document, is_last_segment_in_document
)
new_segment = segment.model_copy(
update={"lines": lines, "structure_tags": structure_tags}
)
segments.append(new_segment)
return Document(segments=segments)
async def _convert_local_to_ogg(input_path: str) -> tuple[str, Callable[[], None]]:
with NamedTemporaryFile(suffix=".ogg", delete=False) as out:
out_path = out.name
proc = await asyncio.create_subprocess_exec(
"ffmpeg",
"-y",
"-i",
input_path,
"-c:a",
"libopus",
"-b:a",
"24k",
"-vn",
"-ac",
"1",
"-ar",
"16000",
out_path,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
_, stderr = await proc.communicate()
if proc.returncode != 0:
raise RuntimeError(f"ffmpeg failed: {stderr.decode(errors='ignore')}")
    def _cleanup() -> None:
        if os.path.exists(out_path):
            os.remove(out_path)
return out_path, _cleanup
def _make_document_from_segments(
builder: DocumentBuilder,
segments: list[WhisperSegment] | list[GoogleSpeechSegment],
*,
max_line_width: int,
) -> Document:
result_segments: list[SegmentNode] = []
for segment in segments:
lines = builder.compute_segment_lines(segment, max_line_width)
time = TimeRange(start=segment.start, end=segment.end)
segment_node = SegmentNode(
text=segment.text.strip(),
semantic_tags=[],
structure_tags=[],
time=time,
lines=lines,
)
result_segments.append(segment_node)
return Document(segments=result_segments)
def _whisper_transcribe_sync(
*,
local_file_path: str,
model_name: str,
language: str | None,
) -> Document:
import whisper # type: ignore[import-untyped]
settings = get_settings()
settings.transcription_models_dir.mkdir(parents=True, exist_ok=True)
builder = DocumentBuilder()
model = whisper.load_model(
model_name, download_root=str(settings.transcription_models_dir)
)
if language is None:
audio = whisper.load_audio(local_file_path)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(
model.device
)
_, probs_raw = model.detect_language(mel)
probs = cast(dict[str, float], probs_raw)
language = max(probs, key=lambda k: probs[k])
result = whisper.transcribe(
audio=whisper.load_audio(local_file_path),
model=model,
word_timestamps=True,
temperature=0.2,
language=language,
verbose=False,
)
parsed = WhisperResult.model_validate(result)
words_options = WordOptions(
highlight_words=True,
max_line_width=32,
max_line_count=2,
)
document = _make_document_from_segments(
builder, parsed.segments, max_line_width=words_options.max_line_width
)
return builder.process_document(document)
async def transcribe_with_whisper(
storage: StorageService,
*,
file_key: str,
model_name: str = "tiny",
language: str | None = None,
) -> Document:
tmp = await storage.download_to_temp(file_key)
try:
return await anyio.to_thread.run_sync(
lambda: _whisper_transcribe_sync(
local_file_path=tmp.path,
model_name=model_name,
language=language,
)
)
finally:
tmp.cleanup()
def _google_transcribe_sync(
*, ogg_bytes: bytes, language_codes: list[str]
) -> GoogleSpeechResult:
from google.cloud import speech
settings = get_settings()
client: speech.SpeechClient = speech.SpeechClient.from_service_account_file(
str(settings.google_service_key_path)
)
audio = speech.RecognitionAudio(content=ogg_bytes)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
sample_rate_hertz=16000,
language_code=language_codes[0],
alternative_language_codes=(
language_codes[1:] if len(language_codes) > 1 else []
),
model="latest_long",
enable_word_time_offsets=True,
)
operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=600)
segments: list[GoogleSpeechSegment] = []
full_text = ""
for result in response.results:
alternative = result.alternatives[0]
words: list[GoogleSpeechWord] = []
for word_info in alternative.words:
words.append(
GoogleSpeechWord(
word=word_info.word,
start=word_info.start_time.total_seconds(),
end=word_info.end_time.total_seconds(),
)
)
if words:
segment_text = alternative.transcript
full_text += segment_text + " "
segments.append(
GoogleSpeechSegment(
text=segment_text,
start=words[0].start,
end=words[-1].end,
words=words,
)
)
return GoogleSpeechResult(
text=full_text.strip(), segments=segments, language=language_codes[0]
)
async def transcribe_with_google_speech(
storage: StorageService,
*,
file_key: str,
language_codes: list[str] | None = None,
) -> Document:
language_codes = language_codes or ["ru-RU", "en-US"]
builder = DocumentBuilder()
words_options = WordOptions()
input_tmp = await storage.download_to_temp(file_key)
try:
ogg_path, ogg_cleanup = await _convert_local_to_ogg(input_tmp.path)
try:
with open(ogg_path, "rb") as f:
content = f.read()
result = await anyio.to_thread.run_sync(
lambda: _google_transcribe_sync(
ogg_bytes=content, language_codes=language_codes
)
)
document = _make_document_from_segments(
builder, result.segments, max_line_width=words_options.max_line_width
)
return builder.process_document(document)
finally:
ogg_cleanup()
finally:
input_tmp.cleanup()
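A sketch of end-to-end usage; `storage` is assumed to implement download_to_temp(key) returning an object with .path and .cleanup(), as the functions above rely on, and the file key is a placeholder:

async def demo(storage: StorageService) -> None:
    document = await transcribe_with_whisper(
        storage,
        file_key="uploads/meeting.wav",  # hypothetical object key
        model_name="tiny",
        language=None,  # None triggers Whisper language detection
    )
    first_segment = document.segments[0]
    print(first_segment.text, [tag.name for tag in first_segment.structure_tags])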