From 27e03cc56cd48dd43df2bad6fea4f54effdaf7c0 Mon Sep 17 00:00:00 2001
From: Daniil <bkom01@icloud.com>
Date: Sun, 22 Mar 2026 22:42:35 +0300
Subject: [PATCH] feat: rename Product Strategist to Product Lead, add lead
 coordination + dual-mode

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../.gitkeep                                  |    0
 .claude/agents/debug-specialist.md            |   23 +-
 .claude/agents/devops-engineer.md             |   23 +-
 .claude/agents/ml-ai-engineer.md              |   22 +-
 .claude/agents/orchestrator.md                |   47 +-
 ...{product-strategist.md => product-lead.md} |   56 +-
 .claude/agents/technical-writer.md            |   20 +-
 .claude/agents/ui-ux-designer.md              |   20 +-
 .claude/rules/agent-pipeline.md               |   27 +
 .gitignore                                    |    3 +
 CLAUDE.md                                     |   24 +
 docs/bug-reports/2026-03-22_bugreport.html    |  682 +++++++++
 docs/consults/video-features-roadmap_v1.md    |  416 +++++
 .../video-features-roadmap_v1_ru.html         |  984 ++++++++++++
 docs/consults/video-features-roadmap_v1_ru.md |  432 ++++++
 docs/consults/video-features-roadmap_v2.md    |  515 +++++++
 .../video-features-roadmap_v2_ru.html         | 1341 +++++++++++++++++
 docs/consults/video-features-roadmap_v2_ru.md |  537 +++++++
 .../2026-03-21-advanced-remotion-templates.md |  918 +++++++++++
 ...3-21-advanced-remotion-templates-design.md |  229 +++
 20 files changed, 6305 insertions(+), 14 deletions(-)
 rename .claude/agents-memory/{product-strategist => product-lead}/.gitkeep (100%)
 rename .claude/agents/{product-strategist.md => product-lead.md} (91%)
 create mode 100644 .claude/rules/agent-pipeline.md
 create mode 100644 docs/bug-reports/2026-03-22_bugreport.html
 create mode 100644 docs/consults/video-features-roadmap_v1.md
 create mode 100644 docs/consults/video-features-roadmap_v1_ru.html
 create mode 100644 docs/consults/video-features-roadmap_v1_ru.md
 create mode 100644 docs/consults/video-features-roadmap_v2.md
 create mode 100644 docs/consults/video-features-roadmap_v2_ru.html
 create mode 100644 docs/consults/video-features-roadmap_v2_ru.md
 create mode 100644 docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md
 create mode 100644 docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md

diff --git a/.claude/agents-memory/product-strategist/.gitkeep b/.claude/agents-memory/product-lead/.gitkeep
similarity index 100%
rename from .claude/agents-memory/product-strategist/.gitkeep
rename to .claude/agents-memory/product-lead/.gitkeep
diff --git a/.claude/agents/debug-specialist.md b/.claude/agents/debug-specialist.md
index 1c1097a..8ca48eb 100644
--- a/.claude/agents/debug-specialist.md
+++ b/.claude/agents/debug-specialist.md
@@ -1,10 +1,9 @@
 ---
 name: debug-specialist
 description: Senior Debugging Engineer — systematic root cause analysis, cross-service debugging, hypothesis-driven investigation, reproduction strategies.
-tools: Read, Grep, Glob, Bash, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__claude-in-chrome__tabs_context_mcp, mcp__claude-in-chrome__tabs_create_mcp, mcp__claude-in-chrome__navigate, mcp__claude-in-chrome__computer, mcp__claude-in-chrome__read_page, mcp__claude-in-chrome__find, mcp__claude-in-chrome__form_input, mcp__claude-in-chrome__get_page_text, mcp__claude-in-chrome__javascript_tool, mcp__claude-in-chrome__read_console_messages, mcp__claude-in-chrome__read_network_requests, mcp__claude-in-chrome__resize_window, mcp__claude-in-chrome__gif_creator, mcp__claude-in-chrome__upload_image, mcp__claude-in-chrome__shortcuts_execute, mcp__claude-in-chrome__shortcuts_list, mcp__claude-in-chrome__switch_browser, mcp__claude-in-chrome__update_plan
+tools: Read, Grep, Glob, Bash, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__claude-in-chrome__tabs_context_mcp, mcp__claude-in-chrome__tabs_create_mcp, mcp__claude-in-chrome__navigate, mcp__claude-in-chrome__computer, mcp__claude-in-chrome__read_page, mcp__claude-in-chrome__find, mcp__claude-in-chrome__form_input, mcp__claude-in-chrome__get_page_text, mcp__claude-in-chrome__javascript_tool, mcp__claude-in-chrome__read_console_messages, mcp__claude-in-chrome__read_network_requests, mcp__claude-in-chrome__resize_window, mcp__claude-in-chrome__gif_creator, mcp__claude-in-chrome__upload_image, mcp__claude-in-chrome__shortcuts_execute, mcp__claude-in-chrome__shortcuts_list, mcp__claude-in-chrome__switch_browser, mcp__claude-in-chrome__update_plan, mcp__redis__client_list, mcp__redis__create_vector_index_hash, mcp__redis__dbsize, mcp__redis__delete, mcp__redis__expire, mcp__redis__get, mcp__redis__get_index_info, mcp__redis__get_indexed_keys_number, mcp__redis__get_indexes, mcp__redis__get_vector_from_hash, mcp__redis__hdel, mcp__redis__hexists, mcp__redis__hget, mcp__redis__hgetall, mcp__redis__hset, mcp__redis__hybrid_search, mcp__redis__info, mcp__redis__json_del, mcp__redis__json_get, mcp__redis__json_set, mcp__redis__llen, mcp__redis__lpop, mcp__redis__lpush, mcp__redis__lrange, mcp__redis__lrem, mcp__redis__publish, mcp__redis__rename, mcp__redis__rpop, mcp__redis__rpush, mcp__redis__sadd, mcp__redis__scan_all_keys, mcp__redis__scan_keys, mcp__redis__search_redis_documents, mcp__redis__set, mcp__redis__set_vector_in_hash, mcp__redis__smembers, mcp__redis__srem, mcp__redis__subscribe, mcp__redis__type, mcp__redis__unsubscribe, mcp__redis__vector_search_hash, mcp__redis__xadd, mcp__redis__xdel, mcp__redis__xrange, mcp__redis__zadd, mcp__redis__zrange, mcp__redis__zrem
 model: opus
 ---
-<!-- TODO: Add Redis MCP tool names after server discovery -->
 
 # First Step
 
@@ -506,6 +505,26 @@ Common handoff patterns for Debug Specialist:
 
 If you have no handoffs needed, omit the Handoff Requests section entirely.
 
+## Subagents
+
+Dispatch specialized subagents via the Agent tool for focused work outside your main investigation.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Quick searches for error patterns, stack trace origins, related files |
+| `feature-dev:code-explorer` | Sonnet | Trace execution paths end-to-end to pinpoint where the bug originates |
+| `feature-dev:code-reviewer` | Sonnet | Review code adjacent to root cause for related bugs, race conditions, error handling gaps |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="Find all files that import or reference [function/class]. Thoroughness: quick")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace the full execution path for [operation] from entry point to completion. Map every error handling branch and state change.")
+Agent(subagent_type="feature-dev:code-reviewer", prompt="Review [files/module] for bugs, race conditions, error handling gaps. Context: investigating [bug description], root cause narrowed to [area]")
+```
+
+Include your debugging context in prompts so subagents know what failure patterns to look for.
+
 ## Quality Standard
 
 Your output must be:
diff --git a/.claude/agents/devops-engineer.md b/.claude/agents/devops-engineer.md
index 2b3d4d2..4c6c728 100644
--- a/.claude/agents/devops-engineer.md
+++ b/.claude/agents/devops-engineer.md
@@ -1,10 +1,9 @@
 ---
 name: devops-engineer
 description: Senior Platform Engineer — CI/CD, Docker, Kubernetes, infrastructure as code, monitoring, deployment strategies.
-tools: Read, Grep, Glob, Bash, Edit, Write, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
+tools: Read, Grep, Glob, Bash, Edit, Write, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__docker__list_containers, mcp__docker__create_container, mcp__docker__run_container, mcp__docker__start_container, mcp__docker__stop_container, mcp__docker__remove_container, mcp__docker__recreate_container, mcp__docker__fetch_container_logs, mcp__docker__list_images, mcp__docker__pull_image, mcp__docker__push_image, mcp__docker__build_image, mcp__docker__remove_image, mcp__docker__list_networks, mcp__docker__create_network, mcp__docker__remove_network, mcp__docker__list_volumes, mcp__docker__create_volume, mcp__docker__remove_volume
 model: opus
 ---
-<!-- TODO: Add Docker MCP tool names after server discovery -->
 
 # First Step
 
@@ -591,6 +590,26 @@ When you need another agent's expertise, include this in your output:
 
 If you have no handoffs, omit the Handoff Requests section entirely.
 
+## Subagents
+
+Dispatch specialized subagents via the Agent tool for focused work outside your main analysis.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Find Docker/CI/config files, environment variable usage, port mappings |
+| `feature-dev:code-explorer` | Sonnet | Trace service dependencies, build pipeline, container startup sequences |
+| `feature-dev:code-reviewer` | Sonnet | Review Dockerfiles, compose configs, CI files for misconfigurations, security issues |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="Find all Dockerfiles, docker-compose files, and CI config files in the monorepo. Thoroughness: medium")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace how the [service] container starts up — from Dockerfile through entrypoint to the running application. Map environment variables, volumes, and network dependencies.")
+Agent(subagent_type="feature-dev:code-reviewer", prompt="Review [Dockerfile/compose/CI files] for misconfigurations, security issues, best practice violations. Context: [what you know]")
+```
+
+Include your infrastructure context in prompts so subagents know what to focus on.
+
 ## Quality Standard
 
 Your output must be:
diff --git a/.claude/agents/ml-ai-engineer.md b/.claude/agents/ml-ai-engineer.md
index d87a81b..6eca6bb 100644
--- a/.claude/agents/ml-ai-engineer.md
+++ b/.claude/agents/ml-ai-engineer.md
@@ -1,7 +1,7 @@
 ---
 name: ml-ai-engineer
 description: Senior ML Engineer — speech-to-text models, transcription optimization, NLP, model deployment, cost/quality trade-offs.
-tools: Read, Grep, Glob, Bash, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
+tools: Read, Grep, Glob, Bash, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
 model: opus
 ---
 
@@ -541,6 +541,26 @@ Common handoff patterns for ML/AI Engineer:
 
 If you have no handoffs, omit the Handoff Requests section entirely.
 
+## Subagents
+
+Dispatch specialized subagents via the Agent tool for focused work outside your main analysis.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Find model configs, transcription pipeline code, engine integrations |
+| `feature-dev:code-explorer` | Sonnet | Trace ML pipeline from audio input through model inference to transcription output |
+| `feature-dev:code-architect` | Sonnet | Design architecture for new engine integrations or pipeline changes |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="Find all transcription-related code: engine configs, model definitions, Dramatiq actors, and audio processing. Thoroughness: very thorough")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace the full transcription pipeline from file upload through engine selection, model inference, to Document output. Map all configuration points and error handlers.")
+Agent(subagent_type="feature-dev:code-architect", prompt="Design the integration architecture for [new engine/model]. Follow existing engine patterns in cofee_backend/cpv3/modules/transcription/.")
+```
+
+Include your ML context in prompts so subagents understand the model/pipeline constraints.
+
 ## Quality Standard
 
 Your output must be:
diff --git a/.claude/agents/orchestrator.md b/.claude/agents/orchestrator.md
index 28893a9..b892122 100644
--- a/.claude/agents/orchestrator.md
+++ b/.claude/agents/orchestrator.md
@@ -1,7 +1,7 @@
 ---
 name: orchestrator
 description: Senior Tech Lead — decomposes tasks, selects specialist agents, packages context, manages handoff chains. Invoke for any non-trivial task.
-tools: Read, Grep, Glob, Bash, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
+tools: Read, Grep, Glob, Bash, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
 model: opus
 ---
 
@@ -150,6 +150,33 @@ For every task, you reason from first principles:
 - No task-type templates — "a frontend feature always needs Frontend Architect + UI/UX Designer + Frontend QA" is WRONG. Maybe this feature is a one-line config change. Reason about the actual task.
 - Minimum viable team — start small, inject more agents if their outputs reveal the need
 
+## Frontend-Last Phasing Rule
+
+When a plan includes **Frontend Architect** or **Frontend QA**, and ALSO includes any of the following, the frontend agents MUST run in a later phase:
+
+| Run BEFORE frontend | Why |
+|---|---|
+| **Backend Architect** | Frontend needs finalized API contracts, response shapes, endpoint paths |
+| **DB Architect** | Schema decisions affect what data is available to the frontend |
+| **UI/UX Designer** | Frontend needs interaction specs, visual direction, component behavior |
+| **Design Auditor** | Design token / component compliance rules inform frontend implementation |
+
+**How to apply:**
+- Phase 1: Backend Architect, DB Architect, UI/UX Designer, Design Auditor (whichever are needed)
+- Phase 2: Frontend Architect, Frontend QA (receive Phase 1 outputs as context)
+- If only frontend agents are needed (no backend/design dependency), they run in Phase 1 as normal
+- This rule applies only when the frontend work depends on the other agents' output for the SAME task — if frontend and backend are working on unrelated aspects, they can run in parallel
+
+This prevents the common failure mode where Frontend Architect designs a component tree before knowing the API contract or design specs, then must redo work after handoff results arrive.
+
+**Context injection into frontend prompts:** When dispatching frontend agents in Phase 2, include relevant outputs from Phase 1 agents in their prompt:
+- From **Backend Architect**: API endpoint paths, response schemas, error codes, auth requirements
+- From **DB Architect**: data model shapes, available fields, relationship structures
+- From **UI/UX Designer**: interaction specs, component behavior, visual direction, layout decisions
+- From **Design Auditor**: token compliance rules, component reuse requirements, accessibility constraints
+
+Summarize each Phase 1 output to its key decisions (max ~200 words per agent) — do not dump full outputs. The frontend agent needs actionable specs, not raw analysis.
+
 # Adaptive Context Injection
 
 After each agent returns results, analyze their output for signals that warrant additional specialists. This is reactive — you inject agents based on what was ACTUALLY discovered, not what you predicted.
@@ -313,6 +340,24 @@ SPECIALIST MEMORY TO INCLUDE:
 - What other agents are working on in parallel (so they can flag cross-cutting concerns)
 - What deliverable you need back from them
 
+# Subagents for Research
+
+Use these subagents to gather context before building your dispatch pipeline. They keep research output out of your main context window.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Quick scan of affected files, module structure, directory layout — enough to scope the task |
+| `feature-dev:code-explorer` | Sonnet | Deep analysis when task scope is unclear — trace features, map dependencies, understand complexity |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="List all files in cofee_backend/cpv3/modules/[module]/ and cofee_frontend/src/features/[domain]/. Thoroughness: quick")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace how [feature] works across frontend, backend, and remotion service. Map the cross-service boundaries and API contracts involved.")
+```
+
+Use `Explore` for most scoping tasks. Use `feature-dev:code-explorer` only when the task touches unfamiliar areas or has unclear blast radius.
+
 # Research Protocol
 
 Your research is high-level and scoping-focused. You are mapping the terrain, not exploring caves.
diff --git a/.claude/agents/product-strategist.md b/.claude/agents/product-lead.md
similarity index 91%
rename from .claude/agents/product-strategist.md
rename to .claude/agents/product-lead.md
index 31546e0..b277b23 100644
--- a/.claude/agents/product-strategist.md
+++ b/.claude/agents/product-lead.md
@@ -1,7 +1,7 @@
 ---
-name: product-strategist
-description: Senior Product/Growth Lead — SaaS monetization, conversion optimization, feature prioritization, competitive analysis, growth mechanics.
-tools: Read, Grep, Glob, Bash, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__claude-in-chrome__tabs_context_mcp, mcp__claude-in-chrome__tabs_create_mcp, mcp__claude-in-chrome__navigate, mcp__claude-in-chrome__computer, mcp__claude-in-chrome__read_page, mcp__claude-in-chrome__find, mcp__claude-in-chrome__form_input, mcp__claude-in-chrome__get_page_text, mcp__claude-in-chrome__javascript_tool, mcp__claude-in-chrome__read_console_messages, mcp__claude-in-chrome__read_network_requests, mcp__claude-in-chrome__resize_window, mcp__claude-in-chrome__gif_creator, mcp__claude-in-chrome__upload_image, mcp__claude-in-chrome__shortcuts_execute, mcp__claude-in-chrome__shortcuts_list, mcp__claude-in-chrome__switch_browser, mcp__claude-in-chrome__update_plan
+name: product-lead
+description: Senior Product Lead — SaaS monetization, conversion optimization, feature prioritization, competitive analysis, growth mechanics. Coordinator for the Product sub-team.
+tools: Read, Grep, Glob, Bash, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__claude-in-chrome__tabs_context_mcp, mcp__claude-in-chrome__tabs_create_mcp, mcp__claude-in-chrome__navigate, mcp__claude-in-chrome__computer, mcp__claude-in-chrome__read_page, mcp__claude-in-chrome__find, mcp__claude-in-chrome__form_input, mcp__claude-in-chrome__get_page_text, mcp__claude-in-chrome__javascript_tool, mcp__claude-in-chrome__read_console_messages, mcp__claude-in-chrome__read_network_requests, mcp__claude-in-chrome__resize_window, mcp__claude-in-chrome__gif_creator, mcp__claude-in-chrome__upload_image, mcp__claude-in-chrome__shortcuts_execute, mcp__claude-in-chrome__shortcuts_list, mcp__claude-in-chrome__switch_browser, mcp__claude-in-chrome__update_plan
 model: opus
 ---
 
@@ -14,7 +14,7 @@ At the very start of every invocation:
    This contains the project context, team roster, handoff format, and quality standards.
 
 2. Read your memory directory:
-   Read directory: `.claude/agents-memory/product-strategist/`
+   Read directory: `.claude/agents-memory/product-lead/`
    List all files and read each one. Check for findings relevant to the current task — previous market research, pricing decisions, competitor intelligence, growth experiments.
 
 3. Read the relevant CLAUDE.md files based on the task scope:
@@ -27,6 +27,32 @@ At the very start of every invocation:
 
 ---
 
+# Hierarchy
+
+- **Lead:** Orchestrator (you report directly to the Orchestrator)
+- **Tier:** 1 (Lead)
+- **Sub-team:** Product
+- **Manages:** UI/UX Designer, Technical Writer, ML/AI Engineer
+
+## Dual-Mode Operation
+
+You operate in two modes, signaled by the orchestrator via `MODE:` in the dispatch context:
+
+**Coordinator mode** (default, when `MODE: coordinator` or MODE omitted): Decompose the task for your sub-team, dispatch the right specialists, synthesize results. Act as a manager — scoping, dispatching, synthesizing. Do NOT do deep product analysis yourself.
+
+**Specialist mode** (when `MODE: specialist`): Answer as a product/growth specialist directly. Do NOT dispatch your sub-team. Used when the orchestrator needs your specific product expertise, not coordination.
+
+## Coordinator Responsibilities
+
+When in coordinator mode:
+1. Receive a scoped product/growth sub-task from the orchestrator
+2. Analyze which specialists are needed
+3. Dispatch specialists with packaged context
+4. Synthesize specialist outputs into a unified recommendation
+5. Report back with synthesized result + audit trail
+
+Follow the dispatch protocol defined in the shared team protocol (`.claude/agents-shared/team-protocol.md`).
+
 # Identity
 
 You are a **Senior Product/Growth Lead** with 15+ years of experience building and scaling SaaS products from zero to millions in ARR. You have led product strategy at video tooling startups, growth at creator-economy platforms, and monetization at B2C SaaS companies. You have launched freemium products that hit 10% free-to-paid conversion (3x industry average), designed pricing pages that increased ARPU 40%, and built growth loops that reduced CAC to near zero for organic channels.
@@ -491,7 +517,7 @@ If I receive handoff results, I will:
 ## Reading Memory
 
 At the START of every invocation:
-1. Read your memory directory: `.claude/agents-memory/product-strategist/`
+1. Read your memory directory: `.claude/agents-memory/product-lead/`
 2. List all files and read each one
 3. Check for findings relevant to the current task — previous market research, pricing decisions, competitor intelligence, growth experiments
 4. Apply relevant memory entries immediately — do not re-research what past invocations already validated
@@ -500,7 +526,7 @@ At the START of every invocation:
 
 At the END of every invocation, if you discovered non-obvious market or product insights:
 
-1. Write a memory file to `.claude/agents-memory/product-strategist/<date>-<topic>.md`
+1. Write a memory file to `.claude/agents-memory/product-lead/<date>-<topic>.md`
 2. Keep it short (5-15 lines), actionable, and specific to YOUR domain
 3. Include an "Applies when:" line so future you knows when to recall it
 4. Do NOT save general SaaS knowledge — only Coffee Project-specific insights
@@ -566,6 +592,24 @@ When you need another agent's expertise, include this in your output:
 
 If you have no handoffs, omit the Handoff Requests section entirely.
 
+## Subagents
+
+Dispatch specialized subagents via the Agent tool for focused work outside your main analysis.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Map feature surface area, find pricing/quota logic, understand current capabilities |
+| `feature-dev:code-explorer` | Sonnet | Understand how a feature is implemented to assess its complexity and monetization potential |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="Find all pricing, quota, subscription, and tier-related code across the monorepo. Thoroughness: very thorough")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace how [feature] works end-to-end — from user action through backend processing to result delivery. Map the cost drivers (API calls, compute, storage).")
+```
+
+Include your strategic context in prompts so subagents focus on business-relevant implementation details.
+
 ## Quality Standard
 
 Your output must be:
diff --git a/.claude/agents/technical-writer.md b/.claude/agents/technical-writer.md
index 2332535..e88d2ae 100644
--- a/.claude/agents/technical-writer.md
+++ b/.claude/agents/technical-writer.md
@@ -1,7 +1,7 @@
 ---
 name: technical-writer
 description: Senior Technical Writer — feature documentation, API docs, architecture decision records, concise and scannable documentation.
-tools: Read, Grep, Glob, Bash, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
+tools: Read, Grep, Glob, Bash, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs
 model: opus
 ---
 
@@ -450,6 +450,24 @@ When you need another agent's expertise, include this in your output:
 
 If you have no handoffs, omit the handoff section entirely.
 
+## Subagents
+
+Dispatch specialized subagents via the Agent tool for focused work outside your main analysis.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Find code to document, existing docs, API endpoints, module structure |
+| `feature-dev:code-explorer` | Sonnet | Deeply understand a feature's implementation for accurate, detailed documentation |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="Find all API router files and their endpoint definitions in cofee_backend/cpv3/modules/. Also find any existing documentation files. Thoroughness: medium")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace how [feature] works from user action to completion. Map entry points, data transformations, error cases, and configuration points — I need this for documentation.")
+```
+
+Include documentation goals in prompts so subagents highlight what matters for the reader.
+
 ## Common Collaboration Patterns
 
 - **Feature documentation** — you draft the doc, handoff technical accuracy review to the relevant Architect, integrate their corrections
diff --git a/.claude/agents/ui-ux-designer.md b/.claude/agents/ui-ux-designer.md
index 4e742b6..ba3554a 100644
--- a/.claude/agents/ui-ux-designer.md
+++ b/.claude/agents/ui-ux-designer.md
@@ -1,7 +1,7 @@
 ---
 name: ui-ux-designer
 description: Senior Product Designer — visual design, interaction patterns, premium SaaS aesthetics, addictive UX, conversion-oriented design.
-tools: Read, Grep, Glob, Bash, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__claude-in-chrome__tabs_context_mcp, mcp__claude-in-chrome__tabs_create_mcp, mcp__claude-in-chrome__navigate, mcp__claude-in-chrome__computer, mcp__claude-in-chrome__read_page, mcp__claude-in-chrome__find, mcp__claude-in-chrome__form_input, mcp__claude-in-chrome__get_page_text, mcp__claude-in-chrome__javascript_tool, mcp__claude-in-chrome__read_console_messages, mcp__claude-in-chrome__read_network_requests, mcp__claude-in-chrome__resize_window, mcp__claude-in-chrome__gif_creator, mcp__claude-in-chrome__upload_image, mcp__claude-in-chrome__shortcuts_execute, mcp__claude-in-chrome__shortcuts_list, mcp__claude-in-chrome__switch_browser, mcp__claude-in-chrome__update_plan
+tools: Read, Grep, Glob, Bash, Agent, WebSearch, WebFetch, mcp__context7__resolve-library-id, mcp__context7__query-docs, mcp__claude-in-chrome__tabs_context_mcp, mcp__claude-in-chrome__tabs_create_mcp, mcp__claude-in-chrome__navigate, mcp__claude-in-chrome__computer, mcp__claude-in-chrome__read_page, mcp__claude-in-chrome__find, mcp__claude-in-chrome__form_input, mcp__claude-in-chrome__get_page_text, mcp__claude-in-chrome__javascript_tool, mcp__claude-in-chrome__read_console_messages, mcp__claude-in-chrome__read_network_requests, mcp__claude-in-chrome__resize_window, mcp__claude-in-chrome__gif_creator, mcp__claude-in-chrome__upload_image, mcp__claude-in-chrome__shortcuts_execute, mcp__claude-in-chrome__shortcuts_list, mcp__claude-in-chrome__switch_browser, mcp__claude-in-chrome__update_plan
 model: opus
 ---
 
@@ -379,6 +379,24 @@ You are part of a 16-agent team. Refer to the shared protocol (`.claude/agents-s
 
 If you have no handoffs, omit the Handoff Requests section entirely.
 
+## Subagents
+
+Dispatch specialized subagents via the Agent tool for focused work outside your main design work.
+
+| Subagent | Model | When to use |
+|----------|-------|-------------|
+| `Explore` | Haiku (fast) | Find current UI patterns, component library, existing page layouts, Radix Themes usage |
+| `feature-dev:code-explorer` | Sonnet | Understand how existing interactions are implemented before proposing new patterns |
+
+### Usage
+
+```
+Agent(subagent_type="Explore", prompt="Find all modal components, form patterns, and page layouts in cofee_frontend/src/. Map which Radix Themes components are used and how. Thoroughness: very thorough")
+Agent(subagent_type="feature-dev:code-explorer", prompt="Trace the user interaction flow for [existing feature] — from trigger through each UI state to completion. Map loading, error, empty, and success states.")
+```
+
+Include your design context in prompts so subagents focus on patterns relevant to your recommendations.
+
 ---
 
 # Output Standards
diff --git a/.claude/rules/agent-pipeline.md b/.claude/rules/agent-pipeline.md
new file mode 100644
index 0000000..775ec41
--- /dev/null
+++ b/.claude/rules/agent-pipeline.md
@@ -0,0 +1,27 @@
+# Agent Pipeline — Mandatory
+
+## The Rule
+
+This project has a 16-agent team (15 specialists + 1 orchestrator) in `.claude/agents/`. For ANY non-trivial task — bug hunt, code review, feature, audit, optimization, research — you MUST consult with the developer team by dispatching the orchestrator and the specialist agents it selects.
+
+Built-in agents (e.g. `feature-dev:code-reviewer`, `feature-dev:code-explorer`) may be used alongside the team, but the project's specialist agents must always be consulted.
+
+## Pipeline
+
+1. **Announce** what you're doing: "Consulting with the developer team to [task description]"
+2. **Dispatch the orchestrator** agent with your analysis of the task
+3. **Follow the orchestrator's pipeline** — dispatch the specialists it selects, in the phases it defines
+4. Built-in agents can run in parallel with the specialist team when useful
+5. **Report results** — synthesize all outputs into a coherent response, crediting which specialists contributed
+
+## Announcement Format
+
+Always start with a brief announcement before dispatching agents; once the orchestrator has selected the specialists, name them in the announcement:
+
+> Consulting with the developer team: dispatching [Agent 1], [Agent 2], [Agent 3] to [task summary].
+
+This tells the user which specialists are working and on what.
+
+## Why
+
+The specialist agents have project-specific context, MCP tools (Postgres, Redis, Docker, Chrome, Lighthouse), memory directories, handoff protocols, and the team protocol for consistent quality. Consulting them ensures domain-expert analysis alongside any built-in agent work.
diff --git a/.gitignore b/.gitignore
index 84ff30f..077acc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,6 @@ remotion_service/
 
 # Claude plugins cache
 .claude/plugins/
+
+# Superpowers brainstorm sessions
+.superpowers/
diff --git a/CLAUDE.md b/CLAUDE.md
index cf5bfd6..1787172 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -121,6 +121,16 @@ All user-facing UI text **must be in Russian**. The only exception is the brand
 This project has a team of 16 specialist agents (15 specialists + 1 Orchestrator).
 Agent files: `.claude/agents/`. Shared protocol: `.claude/agents-shared/team-protocol.md`.
 
+### Developer Team Consultation
+
+For ANY non-trivial task, you MUST consult with the developer team:
+
+1. **Announce**: "Consulting with the developer team to [task summary]"
+2. Dispatch the `orchestrator` agent with your analysis — it selects the right specialists
+3. Built-in agents (code-reviewer, code-explorer, etc.) may be used alongside the team,
+   but the project's specialist agents must always be consulted
+4. **Credit specialists** in your final response — state which agents contributed
+
 ### When to Use the Orchestrator
 
 For ANY non-trivial task (feature, bug fix, audit, optimization, research, infrastructure,
@@ -133,6 +143,20 @@ review, documentation), you MUST:
 Skip the Orchestrator ONLY for trivial tasks: rename a variable, fix a typo, answer a
 quick factual question.
 
+### Frontend-Last Phasing
+
+When a plan includes frontend agents (Frontend Architect, Frontend QA) AND backend/design
+agents, always run backend/design first:
+- **Phase 1**: Backend Architect, DB Architect, UI/UX Designer, Design Auditor
+- **Phase 2**: Frontend Architect, Frontend QA (with Phase 1 outputs as context)
+
+Frontend depends on API contracts from backend and specs from design. Running frontend agents
+after Phase 1 prevents rework. If only frontend agents are needed, they run in Phase 1 normally.
+
+When dispatching frontend agents in Phase 2, include relevant Phase 1 outputs in their
+prompt: API contracts, response schemas, data model shapes, interaction specs, design
+constraints. Summarize each to key decisions (~200 words max), not raw output.
+
 ### Dispatch Loop
 
 After receiving the Orchestrator's plan:
diff --git a/docs/bug-reports/2026-03-22_bugreport.html b/docs/bug-reports/2026-03-22_bugreport.html
new file mode 100644
index 0000000..ee33d8a
--- /dev/null
+++ b/docs/bug-reports/2026-03-22_bugreport.html
@@ -0,0 +1,682 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Cofee Project — Bug Audit Report (2026-03-22)</title>
+<style>
+  :root {
+    --bg: #0d1117;
+    --surface: #161b22;
+    --surface-2: #1c2129;
+    --border: #30363d;
+    --text: #e6edf3;
+    --text-muted: #8b949e;
+    --accent: #58a6ff;
+    --critical: #f85149;
+    --critical-bg: rgba(248, 81, 73, 0.1);
+    --high: #f0883e;
+    --high-bg: rgba(240, 136, 62, 0.1);
+    --medium: #d29922;
+    --medium-bg: rgba(210, 153, 34, 0.1);
+    --low: #8b949e;
+    --low-bg: rgba(139, 148, 158, 0.08);
+    --green: #3fb950;
+    --green-bg: rgba(63, 185, 80, 0.1);
+  }
+
+  * { margin: 0; padding: 0; box-sizing: border-box; }
+
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Noto Sans', Helvetica, Arial, sans-serif;
+    background: var(--bg);
+    color: var(--text);
+    line-height: 1.6;
+    padding: 2rem;
+  }
+
+  .container {
+    max-width: 1200px;
+    margin: 0 auto;
+  }
+
+  h1 {
+    font-size: 2rem;
+    font-weight: 600;
+    margin-bottom: 0.5rem;
+    color: var(--text);
+  }
+
+  h2 {
+    font-size: 1.5rem;
+    font-weight: 600;
+    margin-top: 2.5rem;
+    margin-bottom: 1rem;
+    padding-bottom: 0.5rem;
+    border-bottom: 1px solid var(--border);
+    color: var(--text);
+  }
+
+  h3 {
+    font-size: 1.1rem;
+    font-weight: 600;
+    margin-top: 1.5rem;
+    margin-bottom: 0.5rem;
+    color: var(--text-muted);
+  }
+
+  .subtitle {
+    color: var(--text-muted);
+    font-size: 0.95rem;
+    margin-bottom: 2rem;
+  }
+
+  .summary-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
+    gap: 1rem;
+    margin: 1.5rem 0;
+  }
+
+  .summary-card {
+    background: var(--surface);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 1.2rem;
+    text-align: center;
+  }
+
+  .summary-card .count {
+    font-size: 2.5rem;
+    font-weight: 700;
+    line-height: 1;
+  }
+
+  .summary-card .label {
+    font-size: 0.85rem;
+    color: var(--text-muted);
+    margin-top: 0.4rem;
+    text-transform: uppercase;
+    letter-spacing: 0.05em;
+  }
+
+  .count-critical { color: var(--critical); }
+  .count-high { color: var(--high); }
+  .count-medium { color: var(--medium); }
+  .count-low { color: var(--low); }
+  .count-total { color: var(--accent); }
+
+  table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 1rem 0;
+    font-size: 0.9rem;
+  }
+
+  th {
+    background: var(--surface);
+    text-align: left;
+    padding: 0.75rem 1rem;
+    font-weight: 600;
+    color: var(--text-muted);
+    border-bottom: 2px solid var(--border);
+    position: sticky;
+    top: 0;
+    z-index: 1;
+  }
+
+  td {
+    padding: 0.75rem 1rem;
+    border-bottom: 1px solid var(--border);
+    vertical-align: top;
+  }
+
+  tr:hover td {
+    background: var(--surface-2);
+  }
+
+  .badge {
+    display: inline-block;
+    padding: 0.15em 0.6em;
+    border-radius: 12px;
+    font-size: 0.75rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: 0.03em;
+    white-space: nowrap;
+  }
+
+  .badge-critical { background: var(--critical-bg); color: var(--critical); border: 1px solid var(--critical); }
+  .badge-high { background: var(--high-bg); color: var(--high); border: 1px solid var(--high); }
+  .badge-medium { background: var(--medium-bg); color: var(--medium); border: 1px solid var(--medium); }
+  .badge-low { background: var(--low-bg); color: var(--low); border: 1px solid var(--low); }
+
+  .area-badge {
+    display: inline-block;
+    padding: 0.15em 0.5em;
+    border-radius: 4px;
+    font-size: 0.75rem;
+    font-weight: 500;
+    background: var(--surface);
+    border: 1px solid var(--border);
+    color: var(--text-muted);
+  }
+
+  code {
+    font-family: 'SF Mono', 'Fira Code', 'Fira Mono', Menlo, Consolas, monospace;
+    font-size: 0.85em;
+    background: var(--surface);
+    padding: 0.15em 0.4em;
+    border-radius: 4px;
+    border: 1px solid var(--border);
+    color: var(--accent);
+    word-break: break-all;
+  }
+
+  .section-description {
+    color: var(--text-muted);
+    margin-bottom: 1rem;
+    font-size: 0.95rem;
+  }
+
+  .quick-wins {
+    background: var(--green-bg);
+    border: 1px solid var(--green);
+    border-radius: 8px;
+    padding: 1.5rem;
+    margin: 1.5rem 0;
+  }
+
+  .quick-wins h3 {
+    color: var(--green);
+    margin-top: 0;
+    font-size: 1.1rem;
+  }
+
+  .quick-wins table {
+    margin-bottom: 0;
+  }
+
+  .quick-wins td, .quick-wins th {
+    border-color: rgba(63, 185, 80, 0.2);
+  }
+
+  .agents-section {
+    background: var(--surface);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 1.5rem;
+    margin: 1.5rem 0;
+  }
+
+  .agents-section h3 {
+    margin-top: 0;
+    color: var(--accent);
+  }
+
+  .agents-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+    gap: 0.75rem;
+    margin-top: 1rem;
+  }
+
+  .agent-card {
+    background: var(--surface-2);
+    border: 1px solid var(--border);
+    border-radius: 6px;
+    padding: 0.75rem 1rem;
+    font-size: 0.9rem;
+  }
+
+  .agent-card strong {
+    color: var(--accent);
+  }
+
+  .agent-card .agent-count {
+    float: right;
+    color: var(--text-muted);
+    font-size: 0.85rem;
+  }
+
+  .theme-list {
+    list-style: none;
+    padding: 0;
+  }
+
+  .theme-list li {
+    padding: 0.5rem 0;
+    border-bottom: 1px solid var(--border);
+    color: var(--text-muted);
+    font-size: 0.9rem;
+  }
+
+  .theme-list li:last-child {
+    border-bottom: none;
+  }
+
+  .theme-list code {
+    font-size: 0.8em;
+  }
+
+  .cross-confirm {
+    font-size: 0.85rem;
+    color: var(--text-muted);
+    margin-top: 1rem;
+  }
+
+  .cross-confirm li {
+    margin-bottom: 0.4rem;
+  }
+
+  footer {
+    margin-top: 3rem;
+    padding-top: 1.5rem;
+    border-top: 1px solid var(--border);
+    color: var(--text-muted);
+    font-size: 0.85rem;
+    text-align: center;
+  }
+
+  @media (max-width: 768px) {
+    body { padding: 1rem; }
+    .summary-grid { grid-template-columns: repeat(2, 1fr); }
+    .agents-grid { grid-template-columns: 1fr; }
+    table { font-size: 0.8rem; }
+    td, th { padding: 0.5rem; }
+  }
+</style>
+</head>
+<body>
+<div class="container">
+
+<h1>Cofee Project — Bug Audit Report</h1>
+<p class="subtitle">
+  Date: 2026-03-22 &nbsp;|&nbsp;
+  Audited by: Backend Architect, Frontend Architect, Remotion Engineer, DB Architect, Security Auditor, Performance Engineer &nbsp;|&nbsp;
+  ~90 unique issues after deduplication
+</p>
+
+<div class="summary-grid">
+  <div class="summary-card">
+    <div class="count count-total">~90</div>
+    <div class="label">Total Issues</div>
+  </div>
+  <div class="summary-card">
+    <div class="count count-critical">10</div>
+    <div class="label">Critical</div>
+  </div>
+  <div class="summary-card">
+    <div class="count count-high">23</div>
+    <div class="label">High</div>
+  </div>
+  <div class="summary-card">
+    <div class="count count-medium">30+</div>
+    <div class="label">Medium</div>
+  </div>
+  <div class="summary-card">
+    <div class="count count-low">30+</div>
+    <div class="label">Low</div>
+  </div>
+</div>
+
+<!-- ============================================================ -->
+<h2>Critical — Fix Immediately</h2>
+<p class="section-description">These issues can cause security breaches, data loss, or application crashes under normal usage.</p>
+
+<table>
+  <thead>
+    <tr><th>#</th><th>Area</th><th>Issue</th><th>File(s)</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>1</td>
+      <td><span class="badge badge-critical">Security</span></td>
+      <td><strong>Path traversal</strong> — any authenticated user can read arbitrary server files via <code>../../etc/passwd</code>. The endpoint resolves the path but never validates it stays within the storage directory.</td>
+      <td><code>files/router.py:103</code></td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td><span class="badge badge-critical">Security</span></td>
+      <td><strong>Unauthenticated webhook</strong> — <code>POST /api/tasks/webhook/{job_id}/</code> has no auth. Anyone can forge job status, inject arbitrary output data, or mark jobs as failed.</td>
+      <td><code>tasks/router.py:195</code></td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td><span class="badge badge-critical">Security</span></td>
+      <td><strong>JWT in JS-accessible cookies</strong> — tokens set via <code>js-cookie</code> with no HttpOnly/Secure/SameSite flags. Any XSS steals both access and refresh tokens.</td>
+      <td><code>useCookie.tsx</code>, <code>LoginPage.tsx:36</code></td>
+    </tr>
+    <tr>
+      <td>4</td>
+      <td><span class="badge badge-critical">Security</span></td>
+      <td><strong>PyJWT CVE-2026-32597</strong> — active vulnerability in the core auth library. Fix available in v2.12.0.</td>
+      <td><code>pyproject.toml</code></td>
+    </tr>
+    <tr>
+      <td>5</td>
+      <td><span class="area-badge">Frontend</span></td>
+      <td><strong>No token refresh</strong> — when access token expires, all API calls fail with opaque <code>"Oops, fetch failed"</code>. Refresh token is set during login but never used again.</td>
+      <td><code>shared/api/index.ts:27</code></td>
+    </tr>
+    <tr>
+      <td>6</td>
+      <td><span class="area-badge">Frontend</span></td>
+      <td><strong>setState during render</strong> — <code>setCaptionedVideoFileId()</code> and <code>setStatus()</code> called outside useEffect, causing infinite re-render loops that freeze the browser tab.</td>
+      <td><code>CaptionResultStep.tsx:69</code>, <code>ConvertMediaView.tsx:51</code></td>
+    </tr>
+    <tr>
+      <td>7</td>
+      <td><span class="area-badge">Frontend</span></td>
+      <td><strong>Workspace state race condition</strong> — WizardProvider and WorkspaceProvider independently PATCH <code>workspace_state</code>, overwriting each other's data on the 1000ms debounce boundary.</td>
+      <td><code>WizardContext.tsx:345</code>, <code>WorkspaceContext.tsx:111</code></td>
+    </tr>
+    <tr>
+      <td>8</td>
+      <td><span class="area-badge">Backend</span></td>
+      <td><strong>Auth session closed prematurely</strong> — <code>get_current_user</code> closes its DB session in <code>finally</code>, leaving the returned User object detached. Any lazy-loaded relationship access causes <code>DetachedInstanceError</code>.</td>
+      <td><code>infrastructure/auth.py:62</code></td>
+    </tr>
+    <tr>
+      <td>9</td>
+      <td><span class="area-badge">Remotion</span></td>
+      <td><strong>Custom fonts never loaded</strong> — only Lobster is loaded at module level. Any other <code>font_family</code> in styleConfig silently renders with system sans-serif.</td>
+      <td><code>Captions.tsx:3,12</code></td>
+    </tr>
+    <tr>
+      <td>10</td>
+      <td><span class="area-badge">Performance</span></td>
+      <td><strong>Sequential S3 frame uploads</strong> — 300 frames uploaded one-at-a-time (30s of round-trip time). Should use <code>asyncio.gather()</code> with semaphore (~3s).</td>
+      <td><code>media/service.py:497</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<!-- ============================================================ -->
+<h2>High — Fix This Sprint</h2>
+<p class="section-description">Significant bugs affecting security, correctness, or user experience. They are not immediately exploitable or crash-inducing, but they need prompt attention.</p>
+
+<h3>Security</h3>
+<table>
+  <thead><tr><th>Issue</th><th>File(s)</th></tr></thead>
+  <tbody>
+    <tr>
+      <td><strong>No refresh token rotation</strong> — stolen token grants permanent access for 30 days with no revocation mechanism</td>
+      <td><code>users/router.py:211</code></td>
+    </tr>
+    <tr>
+      <td><strong>Remotion has zero authentication</strong> — port 3001 exposed, enables SSRF via <code>callbackUrl</code></td>
+      <td><code>server/index.ts:22</code></td>
+    </tr>
+    <tr>
+      <td><strong>IDOR on artifacts/transcriptions/events</strong> — any authenticated user reads/modifies anyone's data (<code>_ = current_user</code>)</td>
+      <td><code>media/router.py:205</code>, <code>transcription/router.py:30</code>, <code>jobs/router.py:106</code></td>
+    </tr>
+    <tr>
+      <td><strong>No rate limiting</strong> on login/register — unlimited brute force</td>
+      <td><code>users/router.py:176</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<h3>Backend</h3>
+<table>
+  <thead><tr><th>Issue</th><th>File(s)</th></tr></thead>
+  <tbody>
+    <tr>
+      <td><strong>Token refresh skips user validation</strong> — deactivated users keep generating new access tokens</td>
+      <td><code>users/router.py:211</code></td>
+    </tr>
+    <tr>
+      <td><strong>Repository update drops explicit None</strong> — impossible to clear nullable fields via PATCH (affects 7 repos)</td>
+      <td><code>jobs/repository.py:78</code> + 6 others</td>
+    </tr>
+    <tr>
+      <td><strong>Routers bypass service layer</strong> — media, transcription, notification routers use repositories directly</td>
+      <td><code>media/router.py:128</code>, <code>transcription/router.py:36</code>, <code>notifications/router.py:63</code></td>
+    </tr>
+    <tr>
+      <td><strong>TaskService couples to 6 cross-module repos</strong> — bypasses business rules in other modules</td>
+      <td><code>tasks/service.py:26</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<h3>Frontend</h3>
+<table>
+  <thead><tr><th>Issue</th><th>File(s)</th></tr></thead>
+  <tbody>
+    <tr>
+      <td><strong>Zero error boundaries</strong> — any JS error crashes the entire app to a blank white screen</td>
+      <td><code>app/</code> (no <code>error.tsx</code> anywhere)</td>
+    </tr>
+    <tr>
+      <td><strong>WebSocket token in URL query string</strong> — logged by proxies and browser history</td>
+      <td><code>SocketProvider.tsx:209</code></td>
+    </tr>
+    <tr>
+      <td><strong>Raw fetch() bypasses auth middleware</strong> — 3 notification endpoints use manual cookies</td>
+      <td><code>NotificationPopup.tsx:84,94</code>, <code>SocketProvider.tsx:156</code></td>
+    </tr>
+    <tr>
+      <td><strong>FSD layer violation</strong> — feature imports from widget layer</td>
+      <td><code>SubtitleRevisionStep.tsx:24</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<h3>Database</h3>
+<table>
+  <thead><tr><th>Issue</th><th>File(s)</th></tr></thead>
+  <tbody>
+    <tr>
+      <td><strong>Missing FK indexes on notifications</strong> — <code>job_id</code>, <code>project_id</code> cause full sequential scans</td>
+      <td><code>notifications/models.py:18</code></td>
+    </tr>
+    <tr>
+      <td><strong>No pagination</strong> on 8 of 9 list endpoints — unbounded queries load entire tables</td>
+      <td>All <code>repository.py list_all()</code> methods</td>
+    </tr>
+    <tr>
+      <td><strong>No CHECK constraints on status columns</strong> — typo in status string = invisible orphaned row</td>
+      <td><code>jobs/models.py</code>, <code>projects/models.py</code>, <code>notifications/models.py</code></td>
+    </tr>
+    <tr>
+      <td><strong><code>files.path</code> queried without index</strong> — sequential scan on every file lookup by path</td>
+      <td><code>files/repository.py:36</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<h3>Remotion</h3>
+<table>
+  <thead><tr><th>Issue</th><th>File(s)</th></tr></thead>
+  <tbody>
+    <tr>
+      <td><strong>No retry on DONE/FAILED webhook</strong> — missed webhook = user's job stuck forever in "running"</td>
+      <td><code>webhook.ts:13</code></td>
+    </tr>
+    <tr>
+      <td><strong>Empty transcription silently renders with no captions</strong> — wasted compute, confusing UX</td>
+      <td><code>useCaptions.ts:27</code></td>
+    </tr>
+    <tr>
+      <td><strong>Sync render path has no concurrency limit</strong> — N requests spawn N Chromium processes, causing OOM</td>
+      <td><code>server/index.ts:42</code></td>
+    </tr>
+    <tr>
+      <td><strong><code>out/</code> directory not created at startup</strong> — first render fails outside Docker with ENOENT</td>
+      <td><code>render_video.ts:134</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<h3>Performance</h3>
+<table>
+  <thead><tr><th>Issue</th><th>File(s)</th></tr></thead>
+  <tbody>
+    <tr>
+      <td><strong>New psycopg2 connection per cancellation check</strong> — 5-20ms overhead + connection churn in Dramatiq</td>
+      <td><code>tasks/service.py:224</code></td>
+    </tr>
+    <tr>
+      <td><strong>No GZip middleware</strong> — transcription JSON (100KB+) sent uncompressed to frontend</td>
+      <td><code>main.py</code></td>
+    </tr>
+    <tr>
+      <td><strong>WizardContext subscribes to full notification store</strong> — entire wizard re-renders every 3 seconds during task processing</td>
+      <td><code>WizardContext.tsx:353</code></td>
+    </tr>
+  </tbody>
+</table>
+
+<!-- ============================================================ -->
+<h2>Medium — Fix Next Sprint</h2>
+<p class="section-description">Suboptimal patterns, technical debt, and issues that compound under load or scale.</p>
+
+<ul class="theme-list">
+  <li><strong>Inconsistent soft-delete</strong> — <code>is_active</code> (BaseModelMixin) vs <code>is_deleted</code> (files, media) on different tables. Some tables have both columns.</li>
+  <li><strong>No password complexity requirements</strong> — users can register with password <code>"a"</code> <span style="color:var(--text-muted)">(<code>users/schemas.py</code>)</span></li>
+  <li><strong>Connection pool defaults too small</strong> — 5+10=15 max; production with 4 workers needs 60+ <span style="color:var(--text-muted)">(<code>settings.py:44</code>)</span></li>
+  <li><strong>Redis connection never closed on shutdown</strong> — singleton created lazily, no lifespan cleanup <span style="color:var(--text-muted)">(<code>notifications/service.py:44</code>)</span></li>
+  <li><strong>No explicit session rollback on failure</strong> — uncommitted state can leak between requests <span style="color:var(--text-muted)">(<code>db/session.py:44</code>)</span></li>
+  <li><strong>Multiple DB commits per webhook callback</strong> — 7+ commits with no atomicity, partial saves on failure <span style="color:var(--text-muted)">(<code>tasks/service.py:1158</code>)</span></li>
+  <li><strong>SSR QueryClient singleton</strong> — module-level <code>new QueryClient()</code> leaks cache between server requests <span style="color:var(--text-muted)">(<code>shared/lib/query_client.ts</code>)</span></li>
+  <li><strong>Unused npm dependencies</strong> — lodash, axios, xior = ~85KB dead weight in bundle <span style="color:var(--text-muted)">(<code>package.json</code>)</span></li>
+  <li><strong>Redundant 2s polling alongside WebSocket</strong> — 30 API requests/min per active wizard, WebSocket already delivers same data <span style="color:var(--text-muted)">(<code>WizardContext.tsx:361</code>)</span></li>
+  <li><strong>All JSON columns should be JSONB</strong> — 10 columns use plain JSON, can't be indexed or queried efficiently <span style="color:var(--text-muted)">(all <code>models.py</code>)</span></li>
+  <li><strong>No <code>server_default</code> on BaseModelMixin</strong> — direct SQL/migrations bypass Python-side defaults <span style="color:var(--text-muted)">(<code>db/base.py:20</code>)</span></li>
+  <li><strong>S3 filename collision</strong> — re-rendering same video overwrites previous captioned version <span style="color:var(--text-muted)">(<code>remotion_service/server/services/s3.ts:76</code>)</span></li>
+  <li><strong><code>lines_per_screen</code> and <code>animation_speed</code> accepted but never used</strong> — schema promises features that don't exist <span style="color:var(--text-muted)">(<code>CaptionStyleSchema.ts</code>)</span></li>
+  <li><strong>Default JWT secret "dev-secret"</strong> — no production guard prevents deployment with guessable secret <span style="color:var(--text-muted)">(<code>settings.py:29</code>)</span></li>
+  <li><strong>No file content type validation on upload</strong> — extension/MIME/magic bytes not checked <span style="color:var(--text-muted)">(<code>files/router.py:39</code>)</span></li>
+  <li><strong>API <code>onError</code> swallows error details</strong> — all errors become <code>"Oops, fetch failed"</code>, impossible to distinguish 401/404/500 <span style="color:var(--text-muted)">(<code>shared/api/index.ts:49</code>)</span></li>
+  <li><strong>Irreversible migration downgrade</strong> — <code>b3c4d5e6f7a8</code> downgrade crashes with NOT NULL violation <span style="color:var(--text-muted)">(<code>alembic/versions/</code>)</span></li>
+  <li><strong><code>project_pct</code> column misnaming</strong> — DB says "project" but API says "progress", confusing mapping <span style="color:var(--text-muted)">(<code>jobs/models.py:34</code>, <code>notifications/service.py:143</code>)</span></li>
+  <li><strong>No ORM relationships defined</strong> — zero <code>relationship()</code> across 11 models; related rows must be fetched with manual per-row queries, which invites N+1 query patterns <span style="color:var(--text-muted)">(all <code>models.py</code>)</span></li>
+  <li><strong>Double audio file loading</strong> — <code>detect_silence</code> decodes the same file twice, doubling memory and time <span style="color:var(--text-muted)">(<code>media/service.py:86</code>)</span></li>
+  <li><strong><code>StorageService.get_file_info</code> makes 3 sequential S3 calls</strong> — could be 1 <code>head_object</code> <span style="color:var(--text-muted)">(<code>storage/base.py:88</code>)</span></li>
+  <li><strong>Token logged to server console</strong> — <code>console.log("Verifying token:", token)</code> in server action <span style="color:var(--text-muted)">(<code>server.ts:16</code>)</span></li>
+  <li><strong>framer-motion in critical path</strong> — 32KB gzipped for 2 components, should use CSS animations <span style="color:var(--text-muted)">(<code>Loader.tsx</code>, <code>HomePage.tsx</code>)</span></li>
+  <li><strong>Additional dependency CVEs</strong> — protobuf, pyasn1, python-multipart have known fixes available <span style="color:var(--text-muted)">(<code>pyproject.toml</code>)</span></li>
+  <li><strong>Webhook secrets exposed in API response</strong> — <code>WebhookRead</code> includes plaintext <code>secret</code> field <span style="color:var(--text-muted)">(<code>webhooks/schemas.py:16</code>)</span></li>
+  <li><strong>No request timing middleware</strong> — can't detect performance regressions <span style="color:var(--text-muted)">(<code>main.py</code>)</span></li>
+  <li><strong>Redis SCAN in cancellation cleanup</strong> — O(n) over entire keyspace instead of direct key lookup <span style="color:var(--text-muted)">(<code>tasks/service.py:1062</code>)</span></li>
+  <li><strong>No <code>/health</code> endpoint on Remotion service</strong> — Docker/K8s probes have nothing to hit <span style="color:var(--text-muted)">(<code>server/index.ts</code>)</span></li>
+  <li><strong>TranscriptionEditor callback churn</strong> — <code>handleSave</code> recreated on every keystroke <span style="color:var(--text-muted)">(<code>TranscriptionEditor.tsx:124</code>)</span></li>
+  <li><strong>No numeric bounds on Remotion schema fields</strong> — negative <code>font_size</code>, <code>fade_duration_frames</code> can crash renderer <span style="color:var(--text-muted)">(<code>CaptionStyleSchema.ts</code>, <code>DocumentSchema.ts</code>)</span></li>
+</ul>
+
+<!-- ============================================================ -->
+<h2>Low — Technical Debt</h2>
+<p class="section-description">Code quality issues, missing conventions, and minor inefficiencies.</p>
+
+<ul class="theme-list">
+  <li>Inline error strings instead of <code>ERROR_</code> constants (all routers)</li>
+  <li>Inconsistent <code>is_active</code>/<code>is_deleted</code> semantics (some models have both columns)</li>
+  <li>19 <code>console.log</code>/<code>console.error</code> statements in production frontend code</li>
+  <li>Missing <code>data-testid</code> on 18 of 21 shared UI components</li>
+  <li>No Content-Security-Policy or security headers on frontend</li>
+  <li>OpenAPI/Swagger docs exposed unconditionally (even in production)</li>
+  <li>Redis without authentication in Docker Compose</li>
+  <li>Default DB credentials <code>postgres/postgres</code> with no production guard</li>
+  <li>MinIO default credentials <code>minioadmin/minioadmin</code></li>
+  <li><code>email</code> column has no unique constraint</li>
+  <li>Webhook secrets stored as plaintext in DB</li>
+  <li><code>broker_id</code> on jobs has no index</li>
+  <li>Duplicate <code>json</code> import in <code>media/service.py</code></li>
+  <li><code>formatBytes</code> duplicated in 3 Remotion files</li>
+  <li><code>GET /api/render</code> returns bare string "Hello" (debug leftover)</li>
+  <li><code>justifyContent</code> uses "left"/"right" instead of "flex-start"/"flex-end" in Remotion</li>
+  <li>Module-level mutable <code>regionIdCounter</code> shared across component instances</li>
+  <li><code>FragmentsStep</code> component is 843 lines (guideline: 150 max)</li>
+  <li>Login page shows no error message to user on failure</li>
+  <li><code>.env</code> not in backend <code>.gitignore</code></li>
+  <li><code>useBreadcrumbs</code> uses <code>JSON.stringify</code> in dependency array</li>
+  <li><code>BreadcrumbsProvider</code> context value not memoized</li>
+  <li><code>TranscriptionModal</code> passes <code>queryKey</code> in wrong argument position</li>
+  <li>Only <code>SIGTERM</code> handled in Remotion, not <code>SIGINT</code></li>
+  <li>Short <code>removeOnFail</code> TTL (2h) makes debugging failed renders difficult</li>
+</ul>
+
+<!-- ============================================================ -->
+<div class="quick-wins">
+  <h3>Top 5 Quick Wins (highest impact, lowest effort)</h3>
+  <table>
+    <thead><tr><th>Fix</th><th>Effort</th><th>Impact</th></tr></thead>
+    <tbody>
+      <tr>
+        <td>Path traversal guard — add 3-line <code>is_relative_to()</code> check</td>
+        <td>5 min</td>
+        <td>Blocks arbitrary file read (Critical security fix)</td>
+      </tr>
+      <tr>
+        <td>Add <code>GZipMiddleware</code> — single line in <code>main.py</code></td>
+        <td>2 min</td>
+        <td>5-10x smaller JSON responses</td>
+      </tr>
+      <tr>
+        <td>Parallelize S3 frame uploads — <code>asyncio.gather()</code> + semaphore</td>
+        <td>30 min</td>
+        <td>10-60s saved per frame extraction job</td>
+      </tr>
+      <tr>
+        <td>Remove unused npm packages (lodash, axios, xior)</td>
+        <td>5 min</td>
+        <td>~85KB bundle size reduction</td>
+      </tr>
+      <tr>
+        <td>Fix setState-during-render — wrap in <code>useEffect</code></td>
+        <td>10 min</td>
+        <td>Prevents browser tab freezes</td>
+      </tr>
+    </tbody>
+  </table>
+</div>
+
+<!-- ============================================================ -->
+<div class="agents-section">
+  <h3>Contributing Agents</h3>
+  <div class="agents-grid">
+    <div class="agent-card">
+      <strong>Backend Architect</strong> <span class="agent-count">25 findings</span><br>
+      API logic, race conditions, service layer patterns, error handling
+    </div>
+    <div class="agent-card">
+      <strong>Frontend Architect</strong> <span class="agent-count">24 findings</span><br>
+      React/Next.js bugs, state management, FSD compliance, type safety
+    </div>
+    <div class="agent-card">
+      <strong>Remotion Engineer</strong> <span class="agent-count">22 findings</span><br>
+      Render pipeline, S3 integration, caption edge cases, webhook reliability
+    </div>
+    <div class="agent-card">
+      <strong>DB Architect</strong> <span class="agent-count">23 findings</span><br>
+      Schema issues, missing indexes, migration risks, query patterns
+    </div>
+    <div class="agent-card">
+      <strong>Security Auditor</strong> <span class="agent-count">22 findings</span><br>
+      OWASP audit, auth/JWT, IDOR, SSRF, dependency CVEs, scanning tools
+    </div>
+    <div class="agent-card">
+      <strong>Performance Engineer</strong> <span class="agent-count">25 findings</span><br>
+      Async blocking, S3 throughput, connection pools, re-renders, bundle size
+    </div>
+  </div>
+
+  <ul class="cross-confirm">
+    <li><strong>Path traversal</strong> and <strong>unauthenticated webhook</strong> — confirmed independently by Backend Architect + Security Auditor</li>
+    <li><strong>Missing pagination</strong> — flagged by Backend Architect + DB Architect + Performance Engineer</li>
+    <li><strong>Inconsistent soft-delete</strong> — flagged by Backend Architect + DB Architect</li>
+    <li><strong>IDOR on artifacts/transcriptions</strong> — flagged by Backend Architect + DB Architect + Security Auditor</li>
+    <li><strong>WizardContext re-renders</strong> — flagged by Frontend Architect + Performance Engineer</li>
+  </ul>
+</div>
+
+<footer>
+  Generated by Claude Code agent team (Orchestrator + 6 specialists) on 2026-03-22
+</footer>
+
+</div>
+</body>
+</html>
diff --git a/docs/consults/video-features-roadmap_v1.md b/docs/consults/video-features-roadmap_v1.md
new file mode 100644
index 0000000..2373177
--- /dev/null
+++ b/docs/consults/video-features-roadmap_v1.md
@@ -0,0 +1,416 @@
+# Video Features Roadmap — Technical Consultation v1
+
+**Date:** 2026-03-22
+**Specialists consulted:** ML/AI Engineer, Backend Architect, Remotion Engineer, Frontend Architect, DevOps Engineer, Performance Engineer
+
+---
+
+## Feature Overview
+
+| # | Feature | Complexity | MVP | Full | Additional Infra |
+|---|---------|-----------|-----|------|-----------------|
+| 1 | Advanced Remotion Templates | Easy-Medium | 3-4 days | 3-4 days | None — ready to implement |
+| 2 | Viral Moments Detection | Medium | 5-7 days | 8-12 days | LLM API key only |
+| 3 | Auto-Cut & Head Tracking | Very Hard | 12-15 days | 30-45 days | Phase 1: nothing; Phase 2: GPU worker |
+| 4 | 9:16 Shorts Conversion | Medium | 6-8 days | +3-4 days after #3 | None |
+| **Total** | | | **26-34 days** | **44-65 days** | |
+
+Realistic for one dev: **6-8 weeks** (all MVPs) or **3-4 months** (full versions).
+
+---
+
+## Feature 1: Advanced Remotion Templates
+
+**Status:** Spec + implementation plan already written.
+
+- Spec: `docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md`
+- Plan: `docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md`
+
+**Scope:** Extend `CaptionStyleSchema` with 4 new highlight styles (pop_in, karaoke, bounce, glow_pulse), 2 transitions (zoom_in, drop_in), 3 fields (word_entrance, highlight_rotation_deg, text_transform). Seed 2 system presets: "Shorts" and "Podcast".
+
+**Changes:** Schema extensions in Remotion + backend, rendering logic in `Captions.tsx`, Alembic migration for presets, frontend StyleEditor form controls.
+
+**No specialist input needed** — fully designed, no new infrastructure.
+
+---
+
+## Feature 2: Viral Moments Detection
+
+### Architecture
+
+**LLM API:** Gemini 2.5 Flash (best Russian language support, $0.15/$0.60 per 1M tokens) or GPT-4o-mini (same pricing, slightly weaker Russian). Cost per 30-min video analysis: ~$0.005.
+
+**Audio augmentation:** `librosa` for RMS energy curves — refines clip boundaries to natural pauses, boosts scoring for high-energy segments. Adds ~20MB dependency, processes 30-min audio in <10 seconds.
+
+**Pipeline:**
+1. Fetch transcription Document from DB
+2. librosa computes energy envelope over full audio (100ms resolution)
+3. LLM analyzes transcription text with structured JSON output prompt
+4. Post-process: snap clip boundaries to low-energy points, compute energy scores
+5. Save clips to new `clips` table
+
+### Backend Design
+
+**New module:** `clips` (models, schemas, repository, service, router) — stores detected clips with project/file/job relationships.
+
+**Clip model:**
+```
+Clip {
+  project_id: UUID (FK projects)
+  source_file_id: UUID (FK files)
+  job_id: UUID? (FK jobs)
+  title: str
+  start_ms: int
+  end_ms: int
+  score: float
+  source_type: "viral_detected" | "user_created" | "auto_generated"
+  status: "pending" | "approved" | "rejected" | "exported"
+  meta: JSON? (LLM reasoning, tags, hashtags)
+}
+```
+
+**New job type:** `VIRAL_DETECT` added to `JobTypeEnum`. Actor calls LLM API directly via `httpx` from Dramatiq worker (no separate service needed).
+
+**LLM integration:**
+- Direct HTTP call from actor with retry + exponential backoff on 429
+- Prompts stored in `cpv3/infrastructure/prompts/viral_detection_v1.txt`
+- Active version controlled by `LLM_VIRAL_PROMPT_VERSION` env var
+- New settings: `LLM_API_URL`, `LLM_API_KEY`, `LLM_MODEL_NAME`
+
+### Frontend Design
+
+- New `ViralClipsStep` in project wizard (features/project/)
+- Clip list with thumbnails, scores, titles, approve/reject buttons
+- Clip edit modal with video preview (scoped playback for start/end range)
+- New job type `VIRAL_DETECT` in notification handling (existing WebSocket infrastructure)
+
+### Key Numbers
+
+| Metric | Value |
+|---|---|
+| Precision | 50-70% |
+| Recall | 60-80% |
+| Processing time | 10-20 seconds |
+| Cost per video | ~$0.005 |
+| Cost at 1,000 videos/month | ~$5 |
+| New dependencies | `google-generativeai` or `openai` (~10MB) + `librosa` (~20MB) |
+
+### Risks
+
+- **Prompt engineering quality** determines feature value — iterate based on user feedback
+- **Visual-only moments** (facial expressions, physical comedy) cannot be detected from text — ~20-30% of viral moments are missed
+- **Transcription quality matters** — Whisper `tiny` has ~25% WER on Russian; use at least `small` for viral detection input
+- **LLM hallucinated timestamps** — validate returned timestamps against actual segment boundaries
+
+### MVP vs Full
+
+- **MVP:** Text-only LLM analysis, no audio energy. Returns clips with scores. User reviews and accepts/rejects.
+- **Full:** Add librosa energy analysis, few-shot prompt examples from user-accepted clips, batch processing, direct clip export to 9:16.
+
+---
+
+## Feature 3: Auto-Cut & Head Tracking
+
+### Architecture
+
+**Face detection:** MediaPipe BlazeFace (Apache 2.0, ~2MB model, 30-60 FPS on CPU). Sample at 3 FPS — face positions don't change significantly within 330ms. Dependency: `mediapipe` (~30MB).
+
+**Speaker diarization:** pyannote.audio 3.1 (MIT, ~10% DER, self-hosted). Runs on CPU at roughly 0.5-1x real-time (15-30 min for 30-min audio). GPU accelerates this to 1-2 min. Dependencies: `pyannote-audio` (~200MB) + `torchaudio` (~50-80MB). PyTorch is already installed via Whisper.
+
+**Face-speaker mapping:**
+- Phase 1: Temporal correlation heuristic — match face tracks to speaker segments by maximum temporal overlap. 70-85% accuracy for 2-speaker videos. Zero additional dependencies. ~100 lines of Python.
+- Phase 2: TalkNet-ASD (Active Speaker Detection) — jointly analyzes lip movement + audio to detect who is speaking. 92.3% accuracy. Requires `torchvision` + model weights (~50MB). Needs GPU (2-5 FPS on CPU vs 15-25 FPS on GPU).
+
+**Video compositing (Remotion approach):**
+
+Dynamic crop via CSS `transform: scale() translate()` on `<Video>` element inside `overflow: hidden` container. This is a GPU-composited browser operation — essentially free performance-wise. No FFmpeg re-encoding needed for the crop itself.
+
+**New Remotion compositions:**
+
+| Composition | Purpose | Phase |
+|---|---|---|
+| `CaptionedVideo` (existing) | Caption overlay on native video | Current |
+| `ShortsVideo` (new) | Static/keyframe crop + captions at 9:16 | Feature 4 |
+| `AutoEditVideo` (new) | Face-tracking crop + cuts + captions | Feature 3 full |
+
+All compositions share the `<Captions>` component and `useCaptions` hook.
+
+**Crop data format (keyframes):**
+```typescript
+type FaceKeyframe = {
+  time: number;       // seconds
+  x: number;          // center of face, 0.0-1.0 normalized
+  y: number;          // center of face, 0.0-1.0 normalized
+  width: number;      // bounding box width, 0.0-1.0
+  height: number;     // bounding box height, 0.0-1.0
+  speakerId?: string;
+};
+
+type CropTrack = {
+  keyframes: FaceKeyframe[];
+  interpolation: "linear" | "ease" | "smooth";
+  zoom: number;       // base zoom multiplier
+  safeMargin: number; // margin around face (0.1 = 10%)
+};
+```
+
+Use Remotion's `interpolate()` between keyframes for smooth pan/zoom. Use `spring()` only for hard cuts between speakers.
+
+### Backend Design
+
+**New job types:** `FACE_DETECT`, `SPEAKER_DIARIZE` added to `JobTypeEnum`. Results stored in `Job.output_data` (JSON) — no new table needed for face/diarization data.
+
+**ML service separation:**
+- Phase 1: Keep in Dramatiq workers (same image). MediaPipe + pyannote add only ~280MB to image.
+- Phase 2: Separate `ml-worker` Docker container on dedicated Dramatiq queues (`ml_head_tracking`, `ml_diarization`). Same codebase, different image, different resource limits.
+
+**Remotion service changes:** `POST /api/render` needs a `compositionId` request parameter to select which composition to render. Props extend with `crop`, `outputWidth`, `outputHeight`.
+
+### Processing Time (30-min 1080p video)
+
+| Step | CPU | GPU |
+|---|---|---|
+| Audio extraction (FFmpeg) | 10-20 sec | 10-20 sec |
+| Face detection (MediaPipe, 3 FPS) | 1-2 min | 10-15 sec |
+| Speaker diarization (pyannote) | **15-30 min** | 1-2 min |
+| Face-speaker mapping | < 1 sec | < 1 sec |
+| Remotion render (crop + captions) | 10-30 min | 10-30 min |
+| **Total (parallelized)** | **35-80 min** | **16-40 min** |
+
+Face detection + diarization can run in parallel (different input: video frames vs audio track).
+
+### Memory Requirements
+
+| Config | Peak RAM |
+|---|---|
+| Whisper base + pyannote (parallel) | 8-12 GB |
+| Whisper medium + pyannote (parallel) | 12-16 GB |
+| Recommended ML worker limit | 16 GB, `--threads 1` |
+
+### Frontend Design
+
+- Head tracking preview: video player with face bounding box overlay (canvas)
+- Speaker timeline track in TimelinePanel (extends existing 4-track system)
+- Controls: zoom level slider, transition speed, speaker selection
+- Before/after comparison toggle
+- UX flow: upload podcast → trigger analysis (ProcessingStep) → review speaker assignments → adjust → export
+
+### Key Numbers
+
+| Metric | Value |
+|---|---|
+| Face detection accuracy | ~90% (MediaPipe on talking-head content) |
+| Diarization DER | ~10% (pyannote 3.1) |
+| Face-speaker mapping (Phase 1) | 70-85% accuracy |
+| Face-speaker mapping (Phase 2, TalkNet) | ~92% accuracy |
+| New dependencies | ~280MB (mediapipe + pyannote + torchaudio) |
+| GPU mandatory? | No for Phase 1; recommended for Phase 2 |
+
+### Risks
+
+- **Face-to-speaker mapping** is the hardest unsolved subproblem — 70-85% accuracy means 1 in 5 assignments may be wrong. Must let users manually correct.
+- **Diarization on CPU** is the bottleneck — 15-30 min for 30-min video. GPU reduces to 1-2 min.
+- **PyTorch version conflicts** between Whisper and pyannote — test `uv sync` before committing.
+- **Video quality loss** when cropping 16:9 to 9:16 — only ~31.6% of frame width is kept. Source must be at least 1080p.
+- **Model download on first run** — pyannote models (~100MB) require Hugging Face license acceptance. Handle in Dockerfile, not at runtime.
+
+### MVP vs Full
+
+- **MVP (12-15 days):** Face detection on sampled frames. User manually selects which face to follow. Static crop to selected face. No speaker switching, no diarization. Works for single-speaker content.
+- **Full (30-45 days):** Speaker diarization + face-speaker mapping. Dynamic crop following active speaker. Smooth spring() transitions on speaker changes. Split-screen for reactions. Multi-speaker support.
+
+---
+
+## Feature 4: 9:16 Shorts Conversion
+
+### Architecture
+
+**Pipeline:** Crop-then-caption, always. Single Remotion render pass using new `ShortsVideo` composition. The composition renders at target 9:16 dimensions, applies CSS crop transform to `<Video>`, and renders captions on top.
+
+**Caption positioning:** No new schema fields needed. Backend adjusts `font_size`, `padding_px`, `max_width_pct` in `styleConfig` for 9:16 aspect ratio. Remotion is a "dumb renderer" — intelligence about what looks good at 9:16 belongs in presets.
+
+**Crop specification:**
+```typescript
+type CropConfig = {
+  mode: "static" | "keyframe";
+  staticCrop?: { x: number; y: number; zoom: number };  // 0-1 normalized
+  keyframes?: Array<{ time: number; x: number; y: number; zoom: number }>;
+  interpolation?: "linear" | "ease" | "smooth";
+};
+```
+
+Static crop is a degenerate case of keyframe crop (single keyframe).
+
+### Backend Design
+
+**New job type:** `ASPECT_CONVERT` in `JobTypeEnum`. New function `crop_to_vertical()` in `media/service.py` using FFmpeg crop+scale filter.
+
+**New artifact type:** `VERTICAL_VIDEO` in `ArtifactTypeEnum`.
+
+**Pipeline:**
+1. Trim source video to clip time range (if from viral detection)
+2. Apply crop (static center crop or face-tracking crop from Feature 3)
+3. Upload to S3 at `{folder}/vertical/{filename}`
+4. Webhook + notification
+
+### Frontend Design
+
+- Crop preview: draggable 9:16 rectangle overlay on video player (CSS `object-fit: cover` + `object-position`)
+- Side-by-side preview toggle: original 16:9 vs cropped 9:16
+- Integration with Feature 2: "Convert to Short" button on each approved viral clip
+- Integration with Feature 3: auto-populate crop region from face detection data
+
+### Processing Time
+
+| Approach | Time (30-min video) |
+|---|---|
+| FFmpeg crop-only (no captions) | 12-36 min |
+| Remotion crop + captions (single pass) | 11-45 min |
+| FFmpeg with NVENC hardware encoding | 3-5 min |
+
+### MVP vs Full
+
+- **MVP (6-8 days):** Manual crop region selection with preview. User drags a 9:16 rectangle over video. New `ShortsVideo` Remotion composition renders crop + captions.
+- **Full (+3-4 days after Feature 3):** Auto-crop based on face detection data. One-click vertical conversion. Batch conversion of viral clips.
+
+---
+
+## Recommended Build Order
+
+```
+Week 1-2:    Feature 1 (Templates)        ████████
+Week 2-4:    Feature 2 (Viral Detection)  ████████████████
+Week 4-6:    Feature 4 MVP (9:16 crop)    ████████████████
+Week 6-14:   Feature 3 (Head Tracking)    ████████████████████████████████████████
+Week 14-15:  Feature 4 upgrade            ████████
+```
+
+**Rationale:**
+1. **Templates first** — ready to implement, zero risk, immediate user value
+2. **Viral detection second** — highest value/effort ratio ($0.005/video, 5-7 days MVP), validates that users want automated editing
+3. **9:16 MVP third** — builds the `ShortsVideo` composition that Feature 3 extends, useful standalone with manual crop
+4. **Head tracking last** — most complex, biggest investment, validates demand from Features 2+4 first
+5. **9:16 upgrade** — trivial once head tracking provides face position data
+
+---
+
+## Cost Analysis
+
+### Per-Video Processing Cost
+
+| Tier | Components | Compute | LLM API | Total | Wait Time |
+|---|---|---|---|---|---|
+| CPU-only | All on CPU | $0.05 | $0.06 | **$0.11** | 35-80 min |
+| GPU (T4) | ML on GPU, FFmpeg on CPU | $0.11 | $0.06 | **$0.17** | 16-40 min |
+| GPU + NVENC | Everything on GPU | $0.13 | $0.06 | **$0.19** | 10-15 min |
+
+### Monthly Infrastructure Cost (100 videos/month)
+
+| Scenario | Cost |
+|---|---|
+| CPU-only (existing infra) | ~$11 + server |
+| Modal serverless GPU | ~$21/month |
+| Spot GPU (g4dn.xlarge) | ~$115/month |
+| Standing GPU (g4dn.xlarge 24/7) | ~$380/month |
+
+**Recommendation:** Start CPU-only. Move to Modal serverless GPU when queue wait times exceed 15 minutes. At 500+ videos/day, evaluate spot instances.
+
+### Suggested SaaS Pricing Tiers
+
+| Tier | Price | Limits | Compute Cost | Margin |
+|---|---|---|---|---|
+| Free | $0 | 10-min videos, low queue priority | ~$0.04/video | Marketing |
+| Pro | $15-30/mo | 30-min videos, GPU ML | ~$0.17/video at 50 videos | 60-80% |
+| Business | $50-100/mo | 60-min videos, priority queue, NVENC | ~$0.38/video | 70-85% |
+
+---
+
+## Infrastructure Decisions
+
+### ML Service Separation
+
+**Phase 1:** Keep ML in existing Dramatiq workers. MediaPipe + pyannote add only ~280MB to image. PyTorch is already installed via Whisper.
+
+**Phase 2:** Separate `ml-worker` Docker container on dedicated queues. Same codebase, different image (`Dockerfile.ml`), different resource limits. Use Docker Compose profiles:
+
+```bash
+docker-compose up                    # Default: no ML worker
+docker-compose --profile ml up       # With ML worker
+```
+
+**Do NOT build a separate HTTP microservice.** Dramatiq already handles job queuing, retries, progress, and cancellation. Adding HTTP service discovery, API contracts, and health checks is overhead with zero benefit for async workloads.
+
+### Immediate Optimizations (Before New Features)
+
+| Action | Impact | Effort |
+|---|---|---|
+| Switch PyTorch to CPU-only index | -800MB image size | 1 hour |
+| Fix worker `REMOTION_SERVICE_URL` default | Bug fix | 5 min |
+| Add resource limits to docker-compose services | Prevent OOM cascades | 30 min |
+| Split Dramatiq into queue pools (lightweight vs ML vs compute) | Prevent worker starvation | 2-3 hours |
+
+---
+
+## Technology Stack Summary
+
+### New Dependencies
+
+| Package | Size | Purpose | Feature |
+|---|---|---|---|
+| `google-generativeai` or `openai` | ~10 MB | LLM API client | 2 |
+| `librosa` | ~20 MB | Audio energy analysis | 2 |
+| `mediapipe` | ~30 MB | Face detection | 3 |
+| `pyannote-audio` | ~200 MB | Speaker diarization | 3 |
+| `torchaudio` | ~50-80 MB | Audio processing for pyannote | 3 |
+| **Total new deps** | **~310-340 MB** | | |
+
+### New Backend Modules
+
+| Module | Purpose | Feature |
+|---|---|---|
+| `clips` | Clip CRUD, review workflow | 2 |
+
+### New Remotion Compositions
+
+| Composition | Purpose | Feature |
+|---|---|---|
+| `ShortsVideo` | Static/keyframe crop + captions at 9:16 | 4 |
+| `AutoEditVideo` | Face-tracking dynamic crop + captions | 3 |
+
+### New Job Types
+
+| Job Type | Purpose | Feature |
+|---|---|---|
+| `VIRAL_DETECT` | LLM analysis of transcription | 2 |
+| `ASPECT_CONVERT` | 9:16 crop + re-encode | 4 |
+| `FACE_DETECT` | Face bounding box detection | 3 |
+| `SPEAKER_DIARIZE` | Speaker diarization | 3 |
+
+---
+
+## Cross-Cutting Issues
+
+| Issue | Flagged By | Priority | Action |
+|---|---|---|---|
+| PyTorch installs CUDA libs on CPU-only infra (+800MB) | DevOps | High | Switch to CPU-only PyTorch index |
+| Worker `--processes 1 --threads 2` will OOM with ML jobs | Performance | High | Split into queue pools, `--threads 1` for ML |
+| `_get_job_status_sync()` leaks DB connections | Performance | High | Fix before adding more actors |
+| No temp file cleanup on OOM crash | Performance | Medium | Add periodic `/tmp` cleanup or cron |
+| `tasks/service.py` at 1,674 lines, will exceed 2K | Backend | Medium | Extract actor boilerplate into decorator/context manager |
+| Worker `REMOTION_SERVICE_URL` default wrong (`localhost:8001`) | DevOps | Medium | Fix to `http://remotion:3001` in docker-compose |
+| No resource limits on any Docker service | DevOps | Medium | Add memory/CPU limits to all services |
+| Whisper should move to ML service eventually | Backend | Low | Plan for Phase 2 when ML worker is split out |
+| `isCurrent` word identity check in Captions.tsx is fragile | Remotion | Low | Compare by index, not text + start time |
+
+---
+
+## Specialist Reports (Full Transcripts)
+
+Full specialist outputs are available in the session transcript. Key files each specialist examined:
+
+- **ML/AI Engineer:** `cpv3/modules/transcription/service.py`, `cpv3/modules/tasks/service.py`, `pyproject.toml`
+- **Backend Architect:** `cpv3/modules/tasks/service.py`, `cpv3/modules/jobs/schemas.py`, `cpv3/modules/media/service.py`, `cpv3/modules/captions/service.py`, `docker-compose.yml`
+- **Remotion Engineer:** `remotion_service/src/components/Composition.tsx`, `Captions.tsx`, `Root.tsx`, `useCaptions.ts`, `useVideoMeta.ts`, all type definitions
+- **Frontend Architect:** `src/widgets/TimelinePanel/`, `src/features/project/FragmentsStep/`, `src/shared/context/WizardContext.tsx`, `src/shared/store/notifications/`
+- **DevOps Engineer:** `docker-compose.yml`, `Dockerfile`, `pyproject.toml`, `uv.lock`
+- **Performance Engineer:** `cpv3/modules/tasks/service.py`, `cpv3/modules/media/service.py`, `cpv3/modules/transcription/service.py`, `docker-compose.yml`
diff --git a/docs/consults/video-features-roadmap_v1_ru.html b/docs/consults/video-features-roadmap_v1_ru.html
new file mode 100644
index 0000000..7ce9370
--- /dev/null
+++ b/docs/consults/video-features-roadmap_v1_ru.html
@@ -0,0 +1,984 @@
+<!DOCTYPE html>
+<html lang="ru">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Дорожная карта видеофич — Техническая консультация v1</title>
+<style>
+  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=JetBrains+Mono:wght@400;500&display=swap');
+
+  :root {
+    --bg: #0f1117;
+    --bg-card: #161922;
+    --bg-card-hover: #1c2030;
+    --border: #2a2f3e;
+    --text: #e4e6ed;
+    --text-dim: #8b8fa3;
+    --text-heading: #f0f2f7;
+    --accent: #6c5ce7;
+    --accent-light: #a29bfe;
+    --accent-bg: rgba(108, 92, 231, 0.1);
+    --green: #00cec9;
+    --green-bg: rgba(0, 206, 201, 0.1);
+    --yellow: #fdcb6e;
+    --yellow-bg: rgba(253, 203, 110, 0.1);
+    --red: #ff6b6b;
+    --red-bg: rgba(255, 107, 107, 0.1);
+    --blue: #74b9ff;
+    --blue-bg: rgba(116, 185, 255, 0.1);
+    --orange: #e17055;
+    --orange-bg: rgba(225, 112, 85, 0.1);
+  }
+
+  * { margin: 0; padding: 0; box-sizing: border-box; }
+
+  body {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+    background: var(--bg);
+    color: var(--text);
+    line-height: 1.7;
+    font-size: 15px;
+    -webkit-font-smoothing: antialiased;
+  }
+
+  .container {
+    max-width: 960px;
+    margin: 0 auto;
+    padding: 60px 32px 120px;
+  }
+
+  /* Header */
+  .hero {
+    text-align: center;
+    margin-bottom: 72px;
+    padding: 64px 0;
+    position: relative;
+  }
+  .hero::before {
+    content: '';
+    position: absolute;
+    top: -60px;
+    left: 50%;
+    transform: translateX(-50%);
+    width: 600px;
+    height: 600px;
+    background: radial-gradient(circle, rgba(108,92,231,0.12) 0%, transparent 70%);
+    pointer-events: none;
+    z-index: 0;
+  }
+  .hero * { position: relative; z-index: 1; }
+  .hero h1 {
+    font-size: 2.6rem;
+    font-weight: 800;
+    color: var(--text-heading);
+    letter-spacing: -0.03em;
+    margin-bottom: 16px;
+    background: linear-gradient(135deg, var(--text-heading) 0%, var(--accent-light) 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    background-clip: text;
+  }
+  .hero .meta {
+    color: var(--text-dim);
+    font-size: 0.9rem;
+    line-height: 1.8;
+  }
+  .hero .meta strong { color: var(--text); font-weight: 500; }
+
+  /* Sections */
+  h2 {
+    font-size: 1.6rem;
+    font-weight: 700;
+    color: var(--text-heading);
+    margin: 64px 0 24px;
+    padding-bottom: 12px;
+    border-bottom: 2px solid var(--border);
+    letter-spacing: -0.02em;
+  }
+  h3 {
+    font-size: 1.15rem;
+    font-weight: 600;
+    color: var(--accent-light);
+    margin: 32px 0 12px;
+  }
+
+  p { margin: 12px 0; color: var(--text); }
+  p.dim { color: var(--text-dim); font-size: 0.9rem; }
+  strong { font-weight: 600; color: var(--text-heading); }
+  em { font-style: italic; color: var(--yellow); }
+
+  /* Feature cards */
+  .feature-card {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 16px;
+    padding: 36px;
+    margin: 24px 0;
+    transition: border-color 0.2s;
+  }
+  .feature-card:hover { border-color: var(--accent); }
+  .feature-card h2 {
+    margin-top: 0;
+    border: none;
+    padding: 0;
+    display: flex;
+    align-items: center;
+    gap: 12px;
+  }
+  .feature-num {
+    display: inline-flex;
+    align-items: center;
+    justify-content: center;
+    width: 36px;
+    height: 36px;
+    border-radius: 10px;
+    font-size: 0.9rem;
+    font-weight: 700;
+    flex-shrink: 0;
+  }
+  .feature-num.f1 { background: var(--green-bg); color: var(--green); }
+  .feature-num.f2 { background: var(--blue-bg); color: var(--blue); }
+  .feature-num.f3 { background: var(--red-bg); color: var(--red); }
+  .feature-num.f4 { background: var(--orange-bg); color: var(--orange); }
+
+  /* Tags */
+  .tag {
+    display: inline-block;
+    padding: 3px 10px;
+    border-radius: 6px;
+    font-size: 0.75rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: 0.05em;
+  }
+  .tag.easy { background: var(--green-bg); color: var(--green); }
+  .tag.medium { background: var(--yellow-bg); color: var(--yellow); }
+  .tag.hard { background: var(--red-bg); color: var(--red); }
+  .status-badge {
+    display: inline-block;
+    padding: 4px 12px;
+    border-radius: 20px;
+    font-size: 0.8rem;
+    font-weight: 500;
+    background: var(--green-bg);
+    color: var(--green);
+    margin-bottom: 12px;
+  }
+
+  /* Tables */
+  table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 16px 0;
+    font-size: 0.88rem;
+  }
+  thead th {
+    background: rgba(108, 92, 231, 0.08);
+    color: var(--accent-light);
+    font-weight: 600;
+    text-align: left;
+    padding: 12px 16px;
+    border-bottom: 2px solid var(--border);
+    font-size: 0.8rem;
+    text-transform: uppercase;
+    letter-spacing: 0.04em;
+  }
+  tbody td {
+    padding: 10px 16px;
+    border-bottom: 1px solid var(--border);
+    color: var(--text);
+  }
+  tbody tr:hover { background: var(--bg-card-hover); }
+  tbody tr:last-child td { border-bottom: none; }
+  .table-wrap {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    overflow: hidden;
+    margin: 16px 0;
+  }
+  .table-wrap table { margin: 0; }
+
+  /* Code */
+  code {
+    font-family: 'JetBrains Mono', monospace;
+    background: rgba(108, 92, 231, 0.1);
+    color: var(--accent-light);
+    padding: 2px 7px;
+    border-radius: 5px;
+    font-size: 0.85em;
+  }
+  pre {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 20px 24px;
+    overflow-x: auto;
+    margin: 16px 0;
+    font-size: 0.85rem;
+    line-height: 1.6;
+  }
+  pre code {
+    background: none;
+    padding: 0;
+    color: var(--text);
+  }
+  .keyword { color: var(--accent-light); }
+  .type-name { color: var(--green); }
+  .string-val { color: var(--yellow); }
+  .comment { color: var(--text-dim); }
+
+  /* Lists */
+  ul, ol {
+    margin: 12px 0;
+    padding-left: 24px;
+  }
+  li {
+    margin: 6px 0;
+    color: var(--text);
+  }
+  li::marker { color: var(--accent-light); }
+
+  /* Callout */
+  .callout {
+    border-left: 3px solid;
+    padding: 16px 20px;
+    margin: 20px 0;
+    border-radius: 0 10px 10px 0;
+    font-size: 0.93rem;
+  }
+  .callout.highlight {
+    border-color: var(--accent);
+    background: var(--accent-bg);
+  }
+  .callout.warning {
+    border-color: var(--yellow);
+    background: var(--yellow-bg);
+  }
+  .callout.danger {
+    border-color: var(--red);
+    background: var(--red-bg);
+  }
+  .callout.info {
+    border-color: var(--blue);
+    background: var(--blue-bg);
+  }
+  .callout.success {
+    border-color: var(--green);
+    background: var(--green-bg);
+  }
+
+  /* Overview grid */
+  .overview-grid {
+    display: grid;
+    grid-template-columns: repeat(2, 1fr);
+    gap: 16px;
+    margin: 24px 0;
+  }
+  .overview-item {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 24px;
+  }
+  .overview-item .label {
+    font-size: 0.75rem;
+    text-transform: uppercase;
+    letter-spacing: 0.06em;
+    color: var(--text-dim);
+    margin-bottom: 4px;
+  }
+  .overview-item .value {
+    font-size: 1.5rem;
+    font-weight: 700;
+    color: var(--text-heading);
+  }
+  .overview-item .value.accent { color: var(--accent-light); }
+  .overview-item .value.green { color: var(--green); }
+  .overview-item .value.yellow { color: var(--yellow); }
+
+  /* Timeline */
+  .timeline {
+    margin: 24px 0;
+    font-family: 'JetBrains Mono', monospace;
+    font-size: 0.82rem;
+  }
+  .timeline-row {
+    display: flex;
+    align-items: center;
+    gap: 12px;
+    padding: 8px 0;
+  }
+  .timeline-label {
+    width: 200px;
+    text-align: right;
+    color: var(--text-dim);
+    flex-shrink: 0;
+  }
+  .timeline-bar {
+    height: 28px;
+    border-radius: 6px;
+    display: flex;
+    align-items: center;
+    padding: 0 12px;
+    font-weight: 500;
+    font-size: 0.75rem;
+    color: #fff;
+    white-space: nowrap;
+  }
+
+  /* Section divider */
+  .divider {
+    height: 1px;
+    background: linear-gradient(to right, transparent, var(--border), transparent);
+    margin: 48px 0;
+  }
+
+  /* Risks */
+  .risk-item {
+    display: flex;
+    gap: 12px;
+    padding: 12px 0;
+    border-bottom: 1px solid var(--border);
+  }
+  .risk-item:last-child { border-bottom: none; }
+  .risk-icon {
+    width: 24px;
+    height: 24px;
+    border-radius: 6px;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: 0.75rem;
+    flex-shrink: 0;
+    margin-top: 2px;
+    background: var(--yellow-bg);
+    color: var(--yellow);
+  }
+
+  /* MVP comparison */
+  .mvp-compare {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 16px;
+    margin: 20px 0;
+  }
+  .mvp-box {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 20px;
+  }
+  .mvp-box h4 {
+    font-size: 0.85rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: 0.04em;
+    margin-bottom: 10px;
+  }
+  .mvp-box.mvp h4 { color: var(--green); }
+  .mvp-box.full h4 { color: var(--accent-light); }
+  .mvp-box p { font-size: 0.88rem; margin: 0; }
+
+  /* Scrollbar */
+  ::-webkit-scrollbar { width: 6px; height: 6px; }
+  ::-webkit-scrollbar-track { background: transparent; }
+  ::-webkit-scrollbar-thumb { background: var(--border); border-radius: 3px; }
+
+  /* Print */
+  @media print {
+    body { background: #fff; color: #1a1a2e; }
+    .feature-card, .table-wrap, pre, .callout, .overview-item, .mvp-box {
+      background: #f8f9fa;
+      border-color: #dee2e6;
+    }
+  }
+
+  @media (max-width: 640px) {
+    .container { padding: 32px 16px 80px; }
+    .hero h1 { font-size: 1.8rem; }
+    .overview-grid { grid-template-columns: 1fr; }
+    .mvp-compare { grid-template-columns: 1fr; }
+    .timeline-label { width: 120px; font-size: 0.7rem; }
+  }
+</style>
+</head>
+<body>
+<div class="container">
+
+  <!-- Hero -->
+  <div class="hero">
+    <h1>Дорожная карта видеофич</h1>
+    <p class="meta">
+      Техническая консультация v1<br>
+      <strong>22 марта 2026</strong><br><br>
+      ML/AI-инженер &middot; Backend-архитектор &middot; Remotion-инженер<br>
+      Frontend-архитектор &middot; DevOps-инженер &middot; Инженер по производительности
+    </p>
+  </div>
+
+  <!-- Overview -->
+  <h2>Общая картина</h2>
+
+  <div class="overview-grid">
+    <div class="overview-item">
+      <div class="label">Всего фич</div>
+      <div class="value accent">4</div>
+    </div>
+    <div class="overview-item">
+      <div class="label">MVP всех фич</div>
+      <div class="value green">26–34 дня</div>
+    </div>
+    <div class="overview-item">
+      <div class="label">Полные версии</div>
+      <div class="value yellow">44–65 дней</div>
+    </div>
+    <div class="overview-item">
+      <div class="label">Один разработчик</div>
+      <div class="value">6–8 недель</div>
+    </div>
+  </div>
+
+  <div class="table-wrap">
+    <table>
+      <thead>
+        <tr>
+          <th>#</th>
+          <th>Фича</th>
+          <th>Сложность</th>
+          <th>MVP</th>
+          <th>Полная версия</th>
+          <th>Доп. инфраструктура</th>
+        </tr>
+      </thead>
+      <tbody>
+        <tr>
+          <td><span class="feature-num f1" style="width:28px;height:28px;font-size:0.8rem;">1</span></td>
+          <td><strong>Продвинутые шаблоны Remotion</strong></td>
+          <td><span class="tag easy">Легко</span></td>
+          <td>3–4 дня</td>
+          <td>3–4 дня</td>
+          <td style="color:var(--green)">Ничего</td>
+        </tr>
+        <tr>
+          <td><span class="feature-num f2" style="width:28px;height:28px;font-size:0.8rem;">2</span></td>
+          <td><strong>Детекция вирусных моментов</strong></td>
+          <td><span class="tag medium">Средне</span></td>
+          <td>5–7 дней</td>
+          <td>8–12 дней</td>
+          <td>API-ключ для LLM</td>
+        </tr>
+        <tr>
+          <td><span class="feature-num f3" style="width:28px;height:28px;font-size:0.8rem;">3</span></td>
+          <td><strong>Авто-монтаж и трекинг лица</strong></td>
+          <td><span class="tag hard">Сложно</span></td>
+          <td>12–15 дней</td>
+          <td>30–45 дней</td>
+          <td>Фаза 2: GPU-воркер</td>
+        </tr>
+        <tr>
+          <td><span class="feature-num f4" style="width:28px;height:28px;font-size:0.8rem;">4</span></td>
+          <td><strong>Конвертация в Shorts (9:16)</strong></td>
+          <td><span class="tag medium">Средне</span></td>
+          <td>6–8 дней</td>
+          <td>+3–4 дня</td>
+          <td style="color:var(--green)">Ничего</td>
+        </tr>
+      </tbody>
+    </table>
+  </div>
+
+  <p class="dim">Реалистичный прогноз для одного разработчика: <strong>6–8 недель</strong> (все MVP) или <strong>3–4 месяца</strong> (полные версии).</p>
+
+  <div class="divider"></div>
+
+  <!-- Feature 1 -->
+  <div class="feature-card">
+    <h2><span class="feature-num f1">1</span> Продвинутые шаблоны Remotion</h2>
+    <span class="status-badge">Спецификация и план готовы</span>
+
+    <p><strong>Что делаем:</strong> Расширяем <code>CaptionStyleSchema</code> четырьмя новыми стилями подсветки слов (<code>pop_in</code>, <code>karaoke</code>, <code>bounce</code>, <code>glow_pulse</code>), двумя переходами (<code>zoom_in</code>, <code>drop_in</code>), тремя полями (<code>word_entrance</code>, <code>highlight_rotation_deg</code>, <code>text_transform</code>). Добавляем два системных пресета: &laquo;Shorts&raquo; и &laquo;Podcast&raquo;.</p>
+
+    <p><strong>Где трогаем код:</strong> Расширение схемы в Remotion + бэкенде, логика рендеринга в <code>Captions.tsx</code>, Alembic-миграция для пресетов, контролы в StyleEditor на фронте.</p>
+
+    <div class="callout success">
+      Особого внимания специалистов не требуется — всё спроектировано, новой инфраструктуры нет. Самая безрисковая фича в этом списке.
+    </div>
+
+    <p class="dim">
+      Спецификация: <code>docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md</code><br>
+      План: <code>docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md</code>
+    </p>
+  </div>
+
+  <!-- Feature 2 -->
+  <div class="feature-card">
+    <h2><span class="feature-num f2">2</span> Детекция вирусных моментов</h2>
+
+    <div class="callout highlight">
+      За <strong>$0.005 за видео</strong> мы можем автоматически находить самые цепляющие фрагменты в подкастах и интервью. Полцента — и AI преподносит на блюдечке моменты, которые зрители пересылают друг другу.
+    </div>
+
+    <h3>Архитектура</h3>
+    <p><strong>LLM API:</strong> Gemini 2.5 Flash — лучшая поддержка русского языка, $0.15/$0.60 за 1М токенов. Альтернатива: GPT-4o-mini. Стоимость анализа одного 30-минутного видео: ~$0.005.</p>
+    <p><strong>Аудио-подкрепление:</strong> <code>librosa</code> для кривых RMS-энергии — уточняет границы клипов до естественных пауз, повышает скор для энергичных сегментов. ~20МБ, обработка 30 мин аудио &lt;10 секунд.</p>
+
+    <h3>Пайплайн</h3>
+    <ol>
+      <li>Берём транскрипцию из БД</li>
+      <li><code>librosa</code> считает огибающую энергии (разрешение 100мс)</li>
+      <li>LLM анализирует текст через промпт со structured JSON output</li>
+      <li>Постобработка: привязка границ к точкам низкой энергии, расчёт energy-скоров</li>
+      <li>Сохраняем клипы в новую таблицу <code>clips</code></li>
+    </ol>
+
+    <h3>Бэкенд</h3>
+    <p><strong>Новый модуль:</strong> <code>clips</code> — хранит найденные клипы со связями project / file / job.</p>
+
+    <p><strong>Модель клипа:</strong></p>
+    <pre><code><span class="type-name">Clip</span> {
+  project_id: <span class="type-name">UUID</span> (FK projects)
+  source_file_id: <span class="type-name">UUID</span> (FK files)
+  job_id: <span class="type-name">UUID?</span> (FK jobs)
+  title: <span class="type-name">str</span>
+  start_ms: <span class="type-name">int</span>
+  end_ms: <span class="type-name">int</span>
+  score: <span class="type-name">float</span>
+  source_type: <span class="string-val">"viral_detected"</span> | <span class="string-val">"user_created"</span> | <span class="string-val">"auto_generated"</span>
+  status: <span class="string-val">"pending"</span> | <span class="string-val">"approved"</span> | <span class="string-val">"rejected"</span> | <span class="string-val">"exported"</span>
+  meta: <span class="type-name">JSON?</span> <span class="comment">(рассуждения LLM, теги, хэштеги)</span>
+}</code></pre>
+
+    <p><strong>Новый тип джоба:</strong> <code>VIRAL_DETECT</code> в <code>JobTypeEnum</code>. Актор вызывает LLM API через <code>httpx</code> из Dramatiq-воркера.</p>
+
+    <h3>Фронтенд</h3>
+    <ul>
+      <li>Новый <code>ViralClipsStep</code> в визарде проекта</li>
+      <li>Список клипов с превьюшками, скорами, кнопками принять/отклонить</li>
+      <li>Модалка редактирования клипа с видео-превью</li>
+      <li>Новый тип джоба <code>VIRAL_DETECT</code> в обработке нотификаций</li>
+    </ul>
+
+    <h3>Ключевые цифры</h3>
+    <div class="table-wrap">
+      <table>
+        <thead><tr><th>Метрика</th><th>Значение</th></tr></thead>
+        <tbody>
+          <tr><td>Точность (precision)</td><td>50–70%</td></tr>
+          <tr><td>Полнота (recall)</td><td>60–80%</td></tr>
+          <tr><td>Время обработки</td><td>10–20 секунд</td></tr>
+          <tr><td>Стоимость за видео</td><td style="color:var(--green)">~$0.005</td></tr>
+          <tr><td>1 000 видео/месяц</td><td style="color:var(--green)">~$5</td></tr>
+          <tr><td>Новые зависимости</td><td>~30 МБ</td></tr>
+        </tbody>
+      </table>
+    </div>
+
+    <div class="callout info">
+      10–20 секунд и пять долларов за тысячу видео. Вдумайтесь в эти цифры.
+    </div>
+
+    <h3>Риски</h3>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Качество промпт-инжиниринга</strong> определяет ценность фичи — придётся итерировать по фидбеку</div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Визуальные моменты</strong> (мимика, физическая комедия) из текста не ловятся — ~20–30% проходят мимо</div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Качество транскрипции критично</strong> — Whisper <code>tiny</code> даёт ~25% WER; для вирусной детекции минимум <code>small</code></div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>LLM галлюцинирует таймстемпы</strong> — обязательно валидировать метки времени</div></div>
+
+    <h3>MVP vs Полная версия</h3>
+    <div class="mvp-compare">
+      <div class="mvp-box mvp">
+        <h4>MVP (5–7 дней)</h4>
+        <p>Только текстовый анализ через LLM, без аудио-энергии. Возвращает клипы со скорами. Пользователь ревьюит и принимает/отклоняет.</p>
+      </div>
+      <div class="mvp-box full">
+        <h4>Полная (8–12 дней)</h4>
+        <p>Добавляем librosa-анализ энергии, few-shot примеры из принятых клипов, пакетную обработку, прямой экспорт в 9:16.</p>
+      </div>
+    </div>
+  </div>
+
+  <!-- Feature 3 -->
+  <div class="feature-card">
+    <h2><span class="feature-num f3">3</span> Авто-монтаж и трекинг лица</h2>
+
+    <div class="callout danger">
+      Самая амбициозная фича. Самая сложная. Загружаете подкаст с двумя спикерами — на выходе динамичное вертикальное видео, где камера сама «следит» за говорящим.
+    </div>
+
+    <h3>Архитектура</h3>
+    <p><strong>Детекция лиц:</strong> MediaPipe BlazeFace (Apache 2.0, ~2МБ модель, 30–60 FPS на CPU). Сэмплируем на 3 FPS. Зависимость: <code>mediapipe</code> (~30МБ).</p>
+    <p><strong>Диаризация спикеров:</strong> pyannote.audio 3.1 (MIT, ~10% DER, self-hosted). CPU: 0.17–0.33x реального времени. GPU: 1–2 мин на 30 мин аудио. Зависимости: <code>pyannote-audio</code> (~200МБ) + <code>torchaudio</code> (~50–80МБ).</p>
+
+    <p><strong>Маппинг лицо-спикер:</strong></p>
+    <ul>
+      <li><strong>Фаза 1:</strong> Эвристика по временной корреляции. 70–85% точности для двух спикеров. ~100 строк Python.</li>
+      <li><strong>Фаза 2:</strong> TalkNet-ASD — анализ губ + аудио. 92.3% точности. Нужен GPU.</li>
+    </ul>
+
+    <h3>Видео-композитинг (Remotion)</h3>
+    <p>Динамический кроп через CSS <code>transform: scale() translate()</code> на <code>&lt;Video&gt;</code> внутри контейнера с <code>overflow: hidden</code>. Это GPU-ускоренная браузерная операция, поэтому она практически не влияет на производительность.</p>
+
+    <h3>Новые Remotion-композиции</h3>
+    <div class="table-wrap">
+      <table>
+        <thead><tr><th>Композиция</th><th>Назначение</th><th>Фаза</th></tr></thead>
+        <tbody>
+          <tr><td><code>CaptionedVideo</code></td><td>Наложение субтитров (существует)</td><td>Текущая</td></tr>
+          <tr><td><code>ShortsVideo</code></td><td>Статический кроп + субтитры в 9:16</td><td>Фича 4</td></tr>
+          <tr><td><code>AutoEditVideo</code></td><td>Кроп с трекингом лица + монтаж + субтитры</td><td>Фича 3</td></tr>
+        </tbody>
+      </table>
+    </div>
+
+    <h3>Формат данных кропа</h3>
+    <pre><code><span class="keyword">type</span> <span class="type-name">FaceKeyframe</span> = {
+  time: <span class="type-name">number</span>;       <span class="comment">// секунды</span>
+  x: <span class="type-name">number</span>;          <span class="comment">// центр лица, 0.0–1.0</span>
+  y: <span class="type-name">number</span>;          <span class="comment">// центр лица, 0.0–1.0</span>
+  width: <span class="type-name">number</span>;      <span class="comment">// ширина bbox, 0.0–1.0</span>
+  height: <span class="type-name">number</span>;     <span class="comment">// высота bbox, 0.0–1.0</span>
+  speakerId?: <span class="type-name">string</span>;
+};
+
+<span class="keyword">type</span> <span class="type-name">CropTrack</span> = {
+  keyframes: <span class="type-name">FaceKeyframe</span>[];
+  interpolation: <span class="string-val">"linear"</span> | <span class="string-val">"ease"</span> | <span class="string-val">"smooth"</span>;
+  zoom: <span class="type-name">number</span>;       <span class="comment">// базовый множитель зума</span>
+  safeMargin: <span class="type-name">number</span>; <span class="comment">// отступ вокруг лица (0.1 = 10%)</span>
+};</code></pre>
+
+    <h3>Бэкенд</h3>
+    <p><strong>Новые типы джобов:</strong> <code>FACE_DETECT</code>, <code>SPEAKER_DIARIZE</code>. Результаты хранятся в <code>Job.output_data</code> (JSON).</p>
+    <p><strong>Отделение ML-сервиса:</strong></p>
+    <ul>
+      <li><strong>Фаза 1:</strong> В Dramatiq-воркерах. MediaPipe + pyannote добавляют ~280МБ к образу.</li>
+      <li><strong>Фаза 2:</strong> Отдельный контейнер <code>ml-worker</code> на выделенных очередях Dramatiq.</li>
+    </ul>
+
+    <h3>Время обработки (30-мин 1080p видео)</h3>
+    <div class="table-wrap">
+      <table>
+        <thead><tr><th>Шаг</th><th>CPU</th><th>GPU</th></tr></thead>
+        <tbody>
+          <tr><td>Извлечение аудио (FFmpeg)</td><td>10–20 сек</td><td>10–20 сек</td></tr>
+          <tr><td>Детекция лиц (MediaPipe, 3 FPS)</td><td>1–2 мин</td><td>10–15 сек</td></tr>
+          <tr><td>Диаризация спикеров (pyannote)</td><td style="color:var(--red);font-weight:600">15–30 мин</td><td style="color:var(--green)">1–2 мин</td></tr>
+          <tr><td>Маппинг лицо-спикер</td><td>&lt; 1 сек</td><td>&lt; 1 сек</td></tr>
+          <tr><td>Рендер Remotion</td><td>10–30 мин</td><td>10–30 мин</td></tr>
+          <tr><td><strong>Итого</strong></td><td><strong>35–80 мин</strong></td><td><strong style="color:var(--green)">16–40 мин</strong></td></tr>
+        </tbody>
+      </table>
+    </div>
+
+    <h3>Требования к памяти</h3>
+    <div class="table-wrap">
+      <table>
+        <thead><tr><th>Конфигурация</th><th>Пиковое потребление RAM</th></tr></thead>
+        <tbody>
+          <tr><td>Whisper base + pyannote (параллельно)</td><td>8–12 ГБ</td></tr>
+          <tr><td>Whisper medium + pyannote (параллельно)</td><td>12–16 ГБ</td></tr>
+          <tr><td>Рекомендуемый лимит ML-воркера</td><td style="color:var(--yellow)">16 ГБ, <code>--threads 1</code></td></tr>
+        </tbody>
+      </table>
+    </div>
+
+    <h3>Фронтенд</h3>
+    <ul>
+      <li>Превью трекинга лица: видеоплеер с наложением bounding box через canvas</li>
+      <li>Трек спикеров в TimelinePanel</li>
+      <li>Контролы: слайдер зума, скорость перехода, выбор спикера</li>
+      <li>Переключатель &laquo;до/после&raquo;</li>
+    </ul>
+
+    <h3>Ключевые цифры</h3>
+    <div class="table-wrap">
+      <table>
+        <thead><tr><th>Метрика</th><th>Значение</th></tr></thead>
+        <tbody>
+          <tr><td>Точность детекции лиц</td><td>~90%</td></tr>
+          <tr><td>DER диаризации</td><td>~10%</td></tr>
+          <tr><td>Маппинг Фаза 1</td><td>70–85%</td></tr>
+          <tr><td>Маппинг Фаза 2 (TalkNet)</td><td style="color:var(--green)">~92%</td></tr>
+          <tr><td>Новые зависимости</td><td>~280 МБ</td></tr>
+          <tr><td>GPU обязателен?</td><td>Нет для Фазы 1</td></tr>
+        </tbody>
+      </table>
+    </div>
+
+    <h3>Риски</h3>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Маппинг лицо-спикер</strong> — каждое пятое назначение может быть неверным. Нужна ручная корректировка.</div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Диаризация на CPU</strong> — бутылочное горлышко. 15–30 мин на 30-мин видео.</div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Конфликты PyTorch</strong> между Whisper и pyannote.</div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Потеря качества</strong> при кропе 16:9 → 9:16 — остаётся ~31.6% ширины. Минимум 1080p.</div></div>
+    <div class="risk-item"><div class="risk-icon">!</div><div><strong>Скачивание моделей</strong> pyannote (~100МБ) требует принятия лицензии HF. Обрабатывать в Dockerfile.</div></div>
+
+    <h3>MVP vs Полная версия</h3>
+    <div class="mvp-compare">
+      <div class="mvp-box mvp">
+        <h4>MVP (12–15 дней)</h4>
+        <p>Детекция лиц. Пользователь выбирает лицо вручную. Статический кроп. Без диаризации. Один спикер.</p>
+      </div>
+      <div class="mvp-box full">
+        <h4>Полная (30–45 дней)</h4>
+        <p>Диаризация + маппинг. Динамический кроп за активным спикером. Spring()-переходы. Сплит-скрин. Несколько спикеров.</p>
+      </div>
+    </div>
+  </div>
+
+  <!-- Feature 4 -->
+  <div class="feature-card">
+    <h2><span class="feature-num f4">4</span> Конвертация в вертикальные Shorts (9:16)</h2>
+
+    <h3>Архитектура</h3>
+    <p>Сначала кроп, потом субтитры — всегда. Один проход рендеринга через новую композицию <code>ShortsVideo</code>.</p>
+
+    <p><strong>Спецификация кропа:</strong></p>
+    <pre><code><span class="keyword">type</span> <span class="type-name">CropConfig</span> = {
+  mode: <span class="string-val">"static"</span> | <span class="string-val">"keyframe"</span>;
+  staticCrop?: { x: <span class="type-name">number</span>; y: <span class="type-name">number</span>; zoom: <span class="type-name">number</span> };
+  keyframes?: <span class="type-name">Array</span>&lt;{ time: <span class="type-name">number</span>; x: <span class="type-name">number</span>; y: <span class="type-name">number</span>; zoom: <span class="type-name">number</span> }&gt;;
+  interpolation?: <span class="string-val">"linear"</span> | <span class="string-val">"ease"</span> | <span class="string-val">"smooth"</span>;
+};</code></pre>
+
+    <h3>Бэкенд</h3>
+    <ul>
+      <li><strong>Новый тип джоба:</strong> <code>ASPECT_CONVERT</code></li>
+      <li><strong>Новый тип артефакта:</strong> <code>VERTICAL_VIDEO</code></li>
+      <li>Функция <code>crop_to_vertical()</code> в <code>media/service.py</code></li>
+    </ul>
+
+    <h3>Фронтенд</h3>
+    <ul>
+      <li>Превью кропа: перетаскиваемый прямоугольник 9:16 поверх видеоплеера</li>
+      <li>Side-by-side: оригинал 16:9 vs обрезанное 9:16</li>
+      <li>Интеграция с Фичей 2: кнопка &laquo;Конвертировать в Short&raquo; на каждом клипе</li>
+      <li>Интеграция с Фичей 3: авто-кроп из данных детекции лица</li>
+    </ul>
+
+    <h3>Время обработки</h3>
+    <div class="table-wrap">
+      <table>
+        <thead><tr><th>Подход</th><th>30-мин видео</th></tr></thead>
+        <tbody>
+          <tr><td>FFmpeg кроп (без субтитров)</td><td>12–36 мин</td></tr>
+          <tr><td>Remotion кроп + субтитры</td><td>11–45 мин</td></tr>
+          <tr><td>FFmpeg с NVENC</td><td style="color:var(--green);font-weight:600">3–5 мин</td></tr>
+        </tbody>
+      </table>
+    </div>
+
+    <h3>MVP vs Полная версия</h3>
+    <div class="mvp-compare">
+      <div class="mvp-box mvp">
+        <h4>MVP (6–8 дней)</h4>
+        <p>Ручной выбор кропа. Перетаскиваемый прямоугольник. <code>ShortsVideo</code> рендерит кроп + субтитры.</p>
+      </div>
+      <div class="mvp-box full">
+        <h4>Полная (+3–4 дня)</h4>
+        <p>Авто-кроп из трекинга лица. Конвертация в один клик. Пакетная обработка.</p>
+      </div>
+    </div>
+  </div>
+
+  <div class="divider"></div>
+
+  <!-- Timeline -->
+  <h2>Рекомендуемый порядок разработки</h2>
+
+  <div class="timeline">
+    <div class="timeline-row">
+      <div class="timeline-label">Неделя 1–2</div>
+      <div class="timeline-bar" style="width:15%;background:var(--green);">Шаблоны</div>
+    </div>
+    <div class="timeline-row">
+      <div class="timeline-label">Неделя 2–4</div>
+      <div class="timeline-bar" style="width:30%;background:var(--blue);">Вирусная детекция</div>
+    </div>
+    <div class="timeline-row">
+      <div class="timeline-label">Неделя 4–6</div>
+      <div class="timeline-bar" style="width:30%;background:var(--orange);">9:16 кроп MVP</div>
+    </div>
+    <div class="timeline-row">
+      <div class="timeline-label">Неделя 6–14</div>
+      <div class="timeline-bar" style="width:80%;background:linear-gradient(90deg, var(--red), #c0392b);">Трекинг лица</div>
+    </div>
+    <div class="timeline-row">
+      <div class="timeline-label">Неделя 14–15</div>
+      <div class="timeline-bar" style="width:15%;background:var(--orange);">9:16 апгрейд</div>
+    </div>
+  </div>
+
+  <h3>Почему именно так</h3>
+  <ol>
+    <li><strong>Шаблоны первыми</strong> — готовы к реализации, нулевой риск, моментальная польза</li>
+    <li><strong>Вирусная детекция второй</strong> — лучшее соотношение пользы к трудозатратам ($0.005/видео)</li>
+    <li><strong>9:16 MVP третьим</strong> — создаёт <code>ShortsVideo</code>, которую расширит Фича 3</li>
+    <li><strong>Трекинг лица последним</strong> — самая сложная; к этому моменту спрос уже валидирован</li>
+    <li><strong>Апгрейд 9:16</strong> — тривиален, когда трекинг лица уже даёт позиции</li>
+  </ol>
+
+  <div class="divider"></div>
+
+  <!-- Cost Analysis -->
+  <h2>Анализ стоимости</h2>
+
+  <h3>Стоимость обработки одного видео</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Уровень</th><th>Состав</th><th>Вычисления</th><th>LLM API</th><th>Итого</th><th>Время</th></tr></thead>
+      <tbody>
+        <tr><td>Только CPU</td><td>Всё на CPU</td><td>$0.05</td><td>$0.06</td><td style="color:var(--green);font-weight:700">$0.11</td><td>35–80 мин</td></tr>
+        <tr><td>GPU (T4)</td><td>ML на GPU</td><td>$0.11</td><td>$0.06</td><td>$0.17</td><td>16–40 мин</td></tr>
+        <tr><td>GPU + NVENC</td><td>Всё на GPU</td><td>$0.13</td><td>$0.06</td><td>$0.19</td><td style="color:var(--green)">10–15 мин</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout highlight">
+    Одиннадцать центов на CPU. Девятнадцать с GPU. Меньше двадцати центов за полный пайплайн с AI-анализом, трекингом лица и кодированием видео.
+  </div>
+
+  <h3>Месячная стоимость инфраструктуры (100 видео/мес)</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Сценарий</th><th>Стоимость</th></tr></thead>
+      <tbody>
+        <tr><td>Только CPU (текущая инфра)</td><td>~$11 + сервер</td></tr>
+        <tr><td>Modal serverless GPU</td><td>~$21/мес</td></tr>
+        <tr><td>Spot GPU (g4dn.xlarge)</td><td>~$115/мес</td></tr>
+        <tr><td>Постоянный GPU</td><td>~$380/мес</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout info">
+    <strong>Рекомендация:</strong> Начинаем на CPU. Переходим на Modal serverless GPU, когда время ожидания в очереди превышает 15 минут. При 500+ видео/день — смотрим на spot-инстансы.
+  </div>
+
+  <h3>Предлагаемые тарифы SaaS</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Тариф</th><th>Цена</th><th>Ограничения</th><th>Себестоимость</th><th>Маржа</th></tr></thead>
+      <tbody>
+        <tr><td><strong>Free</strong></td><td>$0</td><td>Видео до 10 мин, низкий приоритет</td><td>~$0.04/видео</td><td style="color:var(--text-dim)">Маркетинг</td></tr>
+        <tr><td><strong>Pro</strong></td><td>$15–30/мес</td><td>Видео до 30 мин, GPU ML</td><td>~$0.17/видео при 50 видео</td><td style="color:var(--green)">60–80%</td></tr>
+        <tr><td><strong>Business</strong></td><td>$50–100/мес</td><td>Видео до 60 мин, приоритет, NVENC</td><td>~$0.38/видео</td><td style="color:var(--green)">70–85%</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="divider"></div>
+
+  <!-- Infrastructure -->
+  <h2>Инфраструктурные решения</h2>
+
+  <h3>Отделение ML-сервиса</h3>
+  <div class="mvp-compare">
+    <div class="mvp-box mvp">
+      <h4>Фаза 1</h4>
+      <p>ML в Dramatiq-воркерах. MediaPipe + pyannote добавляют ~280МБ. PyTorch уже установлен через Whisper.</p>
+    </div>
+    <div class="mvp-box full">
+      <h4>Фаза 2</h4>
+      <p>Отдельный <code>ml-worker</code> контейнер. Тот же код, другой образ (<code>Dockerfile.ml</code>), другие лимиты ресурсов.</p>
+    </div>
+  </div>
+
+  <pre><code>docker-compose up                    <span class="comment"># По умолчанию: без ML-воркера</span>
+docker-compose --profile ml up       <span class="comment"># С ML-воркером</span></code></pre>
+
+  <div class="callout warning">
+    <strong>НЕ строить отдельный HTTP-микросервис.</strong> Dramatiq уже обеспечивает очередь джобов, ретраи, прогресс и отмену. HTTP service discovery — оверхед с нулевой пользой для асинхронных нагрузок.
+  </div>
+
+  <h3>Немедленные оптимизации</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Действие</th><th>Эффект</th><th>Трудозатраты</th></tr></thead>
+      <tbody>
+        <tr><td>PyTorch на CPU-only индекс</td><td style="color:var(--green)">-800МБ образ</td><td>1 час</td></tr>
+        <tr><td>Исправить <code>REMOTION_SERVICE_URL</code></td><td>Баг-фикс</td><td>5 мин</td></tr>
+        <tr><td>Лимиты ресурсов docker-compose</td><td>Предотвращение каскадных OOM</td><td>30 мин</td></tr>
+        <tr><td>Пулы очередей Dramatiq</td><td>Предотвращение голодания воркеров</td><td>2–3 часа</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <p class="dim">Четыре задачи. Суммарно полдня. Результат: образ на 800 МБ меньше, исправленный баг и страховка от OOM.</p>
+
+  <div class="divider"></div>
+
+  <!-- Tech Stack Summary -->
+  <h2>Сводка по технологическому стеку</h2>
+
+  <h3>Новые зависимости</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Пакет</th><th>Размер</th><th>Назначение</th><th>Фича</th></tr></thead>
+      <tbody>
+        <tr><td><code>google-generativeai</code> / <code>openai</code></td><td>~10 МБ</td><td>LLM API клиент</td><td><span class="feature-num f2" style="width:22px;height:22px;font-size:0.7rem;">2</span></td></tr>
+        <tr><td><code>librosa</code></td><td>~20 МБ</td><td>Анализ энергии аудио</td><td><span class="feature-num f2" style="width:22px;height:22px;font-size:0.7rem;">2</span></td></tr>
+        <tr><td><code>mediapipe</code></td><td>~30 МБ</td><td>Детекция лиц</td><td><span class="feature-num f3" style="width:22px;height:22px;font-size:0.7rem;">3</span></td></tr>
+        <tr><td><code>pyannote-audio</code></td><td>~200 МБ</td><td>Диаризация спикеров</td><td><span class="feature-num f3" style="width:22px;height:22px;font-size:0.7rem;">3</span></td></tr>
+        <tr><td><code>torchaudio</code></td><td>~50–80 МБ</td><td>Обработка аудио</td><td><span class="feature-num f3" style="width:22px;height:22px;font-size:0.7rem;">3</span></td></tr>
+        <tr><td><strong>Итого</strong></td><td style="color:var(--yellow);font-weight:700">~310–340 МБ</td><td></td><td></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Новые модули, композиции, типы джобов</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Элемент</th><th>Назначение</th><th>Фича</th></tr></thead>
+      <tbody>
+        <tr><td>Модуль <code>clips</code></td><td>CRUD клипов, ревью</td><td><span class="feature-num f2" style="width:22px;height:22px;font-size:0.7rem;">2</span></td></tr>
+        <tr><td>Композиция <code>ShortsVideo</code></td><td>Статический кроп + субтитры 9:16</td><td><span class="feature-num f4" style="width:22px;height:22px;font-size:0.7rem;">4</span></td></tr>
+        <tr><td>Композиция <code>AutoEditVideo</code></td><td>Динамический кроп + субтитры</td><td><span class="feature-num f3" style="width:22px;height:22px;font-size:0.7rem;">3</span></td></tr>
+        <tr><td>Джоб <code>VIRAL_DETECT</code></td><td>LLM-анализ транскрипции</td><td><span class="feature-num f2" style="width:22px;height:22px;font-size:0.7rem;">2</span></td></tr>
+        <tr><td>Джоб <code>ASPECT_CONVERT</code></td><td>9:16 кроп</td><td><span class="feature-num f4" style="width:22px;height:22px;font-size:0.7rem;">4</span></td></tr>
+        <tr><td>Джоб <code>FACE_DETECT</code></td><td>Детекция лиц</td><td><span class="feature-num f3" style="width:22px;height:22px;font-size:0.7rem;">3</span></td></tr>
+        <tr><td>Джоб <code>SPEAKER_DIARIZE</code></td><td>Диаризация</td><td><span class="feature-num f3" style="width:22px;height:22px;font-size:0.7rem;">3</span></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="divider"></div>
+
+  <!-- Cross-cutting Issues -->
+  <h2>Сквозные проблемы</h2>
+  <p class="dim">Шесть специалистов — шесть взглядов на одну кодовую базу.</p>
+
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Проблема</th><th>Кто</th><th>Приоритет</th><th>Действие</th></tr></thead>
+      <tbody>
+        <tr><td>PyTorch тащит CUDA (+800МБ)</td><td>DevOps</td><td><span class="tag hard" style="font-size:0.65rem;">Высокий</span></td><td>CPU-only PyTorch индекс</td></tr>
+        <tr><td>Воркер упадёт по OOM на ML-джобах</td><td>Performance</td><td><span class="tag hard" style="font-size:0.65rem;">Высокий</span></td><td>Пулы очередей, <code>--threads 1</code></td></tr>
+        <tr><td><code>_get_job_status_sync()</code> течёт соединениями</td><td>Performance</td><td><span class="tag hard" style="font-size:0.65rem;">Высокий</span></td><td>Починить до новых акторов</td></tr>
+        <tr><td>Нет очистки <code>/tmp</code> при OOM</td><td>Performance</td><td><span class="tag medium" style="font-size:0.65rem;">Средний</span></td><td>Периодическая очистка / cron</td></tr>
+        <tr><td><code>tasks/service.py</code> — 1 674 строки</td><td>Backend</td><td><span class="tag medium" style="font-size:0.65rem;">Средний</span></td><td>Декоратор/контекст-менеджер</td></tr>
+        <tr><td><code>REMOTION_SERVICE_URL</code> неверный</td><td>DevOps</td><td><span class="tag medium" style="font-size:0.65rem;">Средний</span></td><td>Исправить на <code>http://remotion:3001</code></td></tr>
+        <tr><td>Нет лимитов ресурсов Docker</td><td>DevOps</td><td><span class="tag medium" style="font-size:0.65rem;">Средний</span></td><td>Добавить memory/CPU лимиты</td></tr>
+        <tr><td>Whisper в ML-сервис</td><td>Backend</td><td><span class="tag easy" style="font-size:0.65rem;">Низкий</span></td><td>Запланировать при Фазе 2</td></tr>
+        <tr><td><code>isCurrent</code> в Captions.tsx</td><td>Remotion</td><td><span class="tag easy" style="font-size:0.65rem;">Низкий</span></td><td>Сравнивать по индексу</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="divider"></div>
+
+  <!-- Specialists -->
+  <h2>Отчёты специалистов</h2>
+  <p class="dim">Ключевые файлы, которые изучал каждый:</p>
+  <ul>
+    <li><strong>ML-инженер:</strong> <code>transcription/service.py</code>, <code>tasks/service.py</code>, <code>pyproject.toml</code></li>
+    <li><strong>Backend-архитектор:</strong> <code>tasks/service.py</code>, <code>jobs/schemas.py</code>, <code>media/service.py</code>, <code>captions/service.py</code>, <code>docker-compose.yml</code></li>
+    <li><strong>Remotion-инженер:</strong> <code>Composition.tsx</code>, <code>Captions.tsx</code>, <code>Root.tsx</code>, <code>useCaptions.ts</code>, все типы</li>
+    <li><strong>Frontend-архитектор:</strong> <code>TimelinePanel/</code>, <code>FragmentsStep/</code>, <code>WizardContext.tsx</code>, <code>notifications/</code></li>
+    <li><strong>DevOps-инженер:</strong> <code>docker-compose.yml</code>, <code>Dockerfile</code>, <code>pyproject.toml</code>, <code>uv.lock</code></li>
+    <li><strong>Инженер по производительности:</strong> <code>tasks/service.py</code>, <code>media/service.py</code>, <code>transcription/service.py</code>, <code>docker-compose.yml</code></li>
+  </ul>
+
+</div>
+</body>
+</html>
\ No newline at end of file
diff --git a/docs/consults/video-features-roadmap_v1_ru.md b/docs/consults/video-features-roadmap_v1_ru.md
new file mode 100644
index 0000000..552022a
--- /dev/null
+++ b/docs/consults/video-features-roadmap_v1_ru.md
@@ -0,0 +1,432 @@
+# Дорожная карта видеофич — Техническая консультация v1
+
+**Дата:** 22 марта 2026
+**Консультанты:** ML/AI-инженер, Backend-архитектор, Remotion-инженер, Frontend-архитектор, DevOps-инженер, Инженер по производительности
+
+---
+
+## Общая картина
+
+Четыре фичи. От простого к безумному. Вот что получается, если разложить всё по полочкам:
+
+| # | Фича | Сложность | MVP | Полная версия | Доп. инфраструктура |
+|---|-------|-----------|-----|---------------|---------------------|
+| 1 | Продвинутые шаблоны Remotion | Легко-средне | 3-4 дня | 3-4 дня | Ничего — можно начинать хоть сейчас |
+| 2 | Детекция вирусных моментов | Средне | 5-7 дней | 8-12 дней | Только API-ключ для LLM |
+| 3 | Авто-монтаж и трекинг лица | Очень сложно | 12-15 дней | 30-45 дней | Фаза 1: ничего; Фаза 2: GPU-воркер |
+| 4 | Конвертация в вертикальные Shorts (9:16) | Средне | 6-8 дней | +3-4 дня после #3 | Ничего |
+| **Итого** | | | **26-34 дня** | **44-65 дней** | |
+
+Реалистичный прогноз для одного разработчика: **6-8 недель** (все MVP) или **3-4 месяца** (полные версии).
+
+---
+
+## Фича 1: Продвинутые шаблоны Remotion
+
+**Статус:** Спецификация и план реализации уже написаны. Бери и делай.
+
+- Спецификация: `docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md`
+- План: `docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md`
+
+**Что делаем:** Расширяем `CaptionStyleSchema` четырьмя новыми стилями подсветки слов (pop_in, karaoke, bounce, glow_pulse), двумя переходами (zoom_in, drop_in), тремя полями (word_entrance, highlight_rotation_deg, text_transform). Добавляем два системных пресета: "Shorts" и "Podcast".
+
+**Где трогаем код:** Расширение схемы в Remotion + бэкенде, логика рендеринга в `Captions.tsx`, Alembic-миграция для пресетов, контролы в StyleEditor на фронте.
+
+**Особого внимания специалистов не требуется** — всё спроектировано, новой инфраструктуры нет. Наименее рискованная фича в этом списке.
+
+---
+
+## Фича 2: Детекция вирусных моментов
+
+Вот тут начинается самое интересное. Всего за $0.005 на видео мы можем автоматически находить самые цепляющие фрагменты в подкастах и интервью. Полцента — и AI подаёт тебе на блюдечке моменты, которые зрители пересылают друг другу.
+
+### Архитектура
+
+**LLM API:** Gemini 2.5 Flash — лучшая поддержка русского языка, $0.15/$0.60 за 1М токенов. Альтернатива: GPT-4o-mini (те же деньги, русский чуть хуже). Стоимость анализа одного 30-минутного видео: примерно $0.005.
+
+**Аудио-подкрепление:** `librosa` для кривых RMS-энергии — уточняет границы клипов до естественных пауз, повышает скор для энергичных сегментов. Зависимость ~20МБ, обработка 30-минутного аудио за <10 секунд.
+
+**Пайплайн:**
+1. Берём транскрипцию из БД
+2. librosa считает огибающую энергии по всему аудио (разрешение 100мс)
+3. LLM анализирует текст транскрипции через промпт со structured JSON output
+4. Постобработка: привязка границ клипов к точкам низкой энергии, расчёт energy-скоров
+5. Сохраняем клипы в новую таблицу `clips`
+
+### Бэкенд
+
+**Новый модуль:** `clips` (models, schemas, repository, service, router) — хранит найденные клипы со связями project/file/job.
+
+**Модель клипа:**
+```
+Clip {
+  project_id: UUID (FK projects)
+  source_file_id: UUID (FK files)
+  job_id: UUID? (FK jobs)
+  title: str
+  start_ms: int
+  end_ms: int
+  score: float
+  source_type: "viral_detected" | "user_created" | "auto_generated"
+  status: "pending" | "approved" | "rejected" | "exported"
+  meta: JSON? (рассуждения LLM, теги, хэштеги)
+}
+```
+
+**Новый тип джоба:** `VIRAL_DETECT` в `JobTypeEnum`. Актор вызывает LLM API напрямую через `httpx` из Dramatiq-воркера — отдельный сервис не нужен.
+
+**Интеграция с LLM:**
+- Прямой HTTP-вызов из актора с retry + exponential backoff на 429
+- Промпты хранятся в `cpv3/infrastructure/prompts/viral_detection_v1.txt`
+- Активная версия контролируется через env-переменную `LLM_VIRAL_PROMPT_VERSION`
+- Новые настройки: `LLM_API_URL`, `LLM_API_KEY`, `LLM_MODEL_NAME`
+
+### Фронтенд
+
+- Новый `ViralClipsStep` в визарде проекта (features/project/)
+- Список клипов с превьюшками, скорами, заголовками, кнопками принять/отклонить
+- Модалка редактирования клипа с видео-превью (воспроизведение ограничено диапазоном start/end)
+- Новый тип джоба `VIRAL_DETECT` в обработке нотификаций (через существующую WebSocket-инфраструктуру)
+
+### Ключевые цифры
+
+| Метрика | Значение |
+|---|---|
+| Точность (precision) | 50-70% |
+| Полнота (recall) | 60-80% |
+| Время обработки | 10-20 секунд |
+| Стоимость за видео | ~$0.005 |
+| Стоимость при 1 000 видео/месяц | ~$5 |
+| Новые зависимости | `google-generativeai` или `openai` (~10МБ) + `librosa` (~20МБ) |
+
+10-20 секунд и пять долларов за тысячу видео. Вдумайтесь в эти цифры.
+
+### Риски
+
+- **Качество промпт-инжиниринга** определяет ценность фичи — придётся итерировать по фидбеку пользователей
+- **Визуальные моменты** (мимика, физическая комедия) из текста не ловятся — ~20-30% вирусных моментов проходят мимо
+- **Качество транскрипции критично** — Whisper `tiny` даёт ~25% WER на русском; для входных данных вирусной детекции использовать минимум `small`
+- **LLM галлюцинирует таймстемпы** — обязательно валидировать возвращённые метки времени против реальных границ сегментов
+
+### MVP vs Полная версия
+
+- **MVP:** Только текстовый анализ через LLM, без аудио-энергии. Возвращает клипы со скорами. Пользователь ревьюит и принимает/отклоняет.
+- **Полная:** Добавляем librosa-анализ энергии, few-shot примеры из принятых пользователем клипов, пакетную обработку, прямой экспорт клипов в 9:16.
+
+---
+
+## Фича 3: Авто-монтаж и трекинг лица
+
+Самая амбициозная фича. Самая сложная. И, возможно, самая впечатляющая для пользователей. Представьте: загружаете подкаст с двумя спикерами, а на выходе получаете динамичное вертикальное видео, где камера сама «следит» за говорящим — как будто это снимал живой оператор.
+
+### Архитектура
+
+**Детекция лиц:** MediaPipe BlazeFace (Apache 2.0, модель ~2МБ, 30-60 FPS на CPU). Сэмплируем на 3 FPS — позиции лиц не меняются значительно за 330мс. Зависимость: `mediapipe` (~30МБ).
+
+**Диаризация спикеров:** pyannote.audio 3.1 (MIT, ~10% DER, self-hosted). На CPU обработка занимает примерно 0.5-1x длительности записи (15-30 минут на 30-минутное аудио). GPU ускоряет до 1-2 минут. Зависимости: `pyannote-audio` (~200МБ) + `torchaudio` (~50-80МБ). PyTorch уже установлен через Whisper.
+
+**Маппинг лицо-спикер:**
+- **Фаза 1:** Эвристика по временнОй корреляции — сопоставляем треки лиц с сегментами спикеров по максимальному пересечению во времени. 70-85% точности для видео с двумя спикерами. Ноль дополнительных зависимостей. Около 100 строк Python.
+- **Фаза 2:** TalkNet-ASD (Active Speaker Detection) — совместный анализ движения губ и аудио для определения, кто говорит. 92.3% точности. Требует `torchvision` + веса модели (~50МБ). Нужен GPU (2-5 FPS на CPU vs 15-25 FPS на GPU).
+
+**Видео-композитинг (подход Remotion):**
+
+Динамический кроп через CSS `transform: scale() translate()` на элементе `<Video>` внутри контейнера с `overflow: hidden`. Это GPU-ускоренная браузерная операция — по сути бесплатная с точки зрения производительности. Для самого кропа пере-кодирование через FFmpeg не нужно.
+
+**Новые Remotion-композиции:**
+
+| Композиция | Назначение | Фаза |
+|---|---|---|
+| `CaptionedVideo` (существует) | Наложение субтитров на нативное видео | Текущая |
+| `ShortsVideo` (новая) | Статический/ключевой кроп + субтитры в 9:16 | Фича 4 |
+| `AutoEditVideo` (новая) | Кроп с трекингом лица + монтаж + субтитры | Фича 3 (полная) |
+
+Все композиции разделяют компонент `<Captions>` и хук `useCaptions`.
+
+**Формат данных кропа (ключевые кадры):**
+```typescript
+type FaceKeyframe = {
+  time: number;       // секунды
+  x: number;          // центр лица, 0.0-1.0 нормализовано
+  y: number;          // центр лица, 0.0-1.0 нормализовано
+  width: number;      // ширина bounding box, 0.0-1.0
+  height: number;     // высота bounding box, 0.0-1.0
+  speakerId?: string;
+};
+
+type CropTrack = {
+  keyframes: FaceKeyframe[];
+  interpolation: "linear" | "ease" | "smooth";
+  zoom: number;       // базовый множитель зума
+  safeMargin: number; // отступ вокруг лица (0.1 = 10%)
+};
+```
+
+Между ключевыми кадрами значения интерполируются через Remotion `interpolate()` — для плавного панорамирования/зума. `spring()` используется только для жёстких переключений между спикерами.
+
+### Бэкенд
+
+**Новые типы джобов:** `FACE_DETECT`, `SPEAKER_DIARIZE` в `JobTypeEnum`. Результаты хранятся в `Job.output_data` (JSON) — новая таблица для данных лица/диаризации не нужна.
+
+**Отделение ML-сервиса:**
+- **Фаза 1:** Оставляем в Dramatiq-воркерах (тот же образ). MediaPipe + pyannote добавляют только ~280МБ к образу.
+- **Фаза 2:** Отдельный Docker-контейнер `ml-worker` на выделенных очередях Dramatiq (`ml_head_tracking`, `ml_diarization`). Тот же код, другой образ, другие лимиты ресурсов.
+
+**Изменения в Remotion-сервисе:** `POST /api/render` нужен параметр `compositionId` для выбора композиции. Props расширяются полями `crop`, `outputWidth`, `outputHeight`.
+
+### Время обработки (30-минутное 1080p видео)
+
+| Шаг | CPU | GPU |
+|---|---|---|
+| Извлечение аудио (FFmpeg) | 10-20 сек | 10-20 сек |
+| Детекция лиц (MediaPipe, 3 FPS) | 1-2 мин | 10-15 сек |
+| Диаризация спикеров (pyannote) | **15-30 мин** | 1-2 мин |
+| Маппинг лицо-спикер | < 1 сек | < 1 сек |
+| Рендер Remotion (кроп + субтитры) | 10-30 мин | 10-30 мин |
+| **Итого (с параллелизацией)** | **35-80 мин** | **16-40 мин** |
+
+Детекция лиц и диаризация могут работать параллельно — у них разные входные данные (видеокадры vs аудиодорожка).
+
+### Требования к памяти
+
+| Конфигурация | Пиковое потребление RAM |
+|---|---|
+| Whisper base + pyannote (параллельно) | 8-12 ГБ |
+| Whisper medium + pyannote (параллельно) | 12-16 ГБ |
+| Рекомендуемый лимит ML-воркера | 16 ГБ, `--threads 1` |
+
+### Фронтенд
+
+- Превью трекинга лица: видеоплеер с наложением bounding box через canvas
+- Трек спикеров в TimelinePanel (расширяет существующую систему из 4 треков)
+- Контролы: слайдер уровня зума, скорость перехода, выбор спикера
+- Переключатель сравнения «до/после»
+- UX-флоу: загрузка подкаста -> запуск анализа (ProcessingStep) -> ревью назначений спикеров -> корректировка -> экспорт
+
+### Ключевые цифры
+
+| Метрика | Значение |
+|---|---|
+| Точность детекции лиц | ~90% (MediaPipe на talking-head контенте) |
+| DER диаризации | ~10% (pyannote 3.1) |
+| Точность маппинга лицо-спикер (Фаза 1) | 70-85% |
+| Точность маппинга лицо-спикер (Фаза 2, TalkNet) | ~92% |
+| Новые зависимости | ~280МБ (mediapipe + pyannote + torchaudio) |
+| GPU обязателен? | Нет для Фазы 1; рекомендуется для Фазы 2 |
+
+### Риски
+
+- **Маппинг лицо-спикер** — главная нерешённая подзадача. Точность 70-85% означает, что каждое пятое назначение может быть неверным. Пользователь должен иметь возможность поправить вручную.
+- **Диаризация на CPU** — узкое место: 15-30 минут на 30-минутное видео. GPU сокращает до 1-2 минут.
+- **Конфликты версий PyTorch** между Whisper и pyannote — обязательно тестировать `uv sync` перед коммитом.
+- **Потеря качества видео** при кропе 16:9 -> 9:16 — остаётся только ~31.6% ширины кадра. Исходник должен быть минимум 1080p.
+- **Скачивание моделей при первом запуске** — модели pyannote (~100МБ) требуют принятия лицензии на Hugging Face. Обрабатывать в Dockerfile, не в рантайме.
+
+### MVP vs Полная версия
+
+- **MVP (12-15 дней):** Детекция лиц на сэмплированных кадрах. Пользователь вручную выбирает, за каким лицом следить. Статический кроп на выбранное лицо. Без переключения спикеров, без диаризации. Работает для одного спикера.
+- **Полная (30-45 дней):** Диаризация + маппинг лицо-спикер. Динамический кроп, следующий за активным спикером. Плавные spring()-переходы при смене спикеров. Сплит-скрин для реакций. Поддержка нескольких спикеров.
+
+---
+
+## Фича 4: Конвертация в вертикальные Shorts (9:16)
+
+### Архитектура
+
+**Пайплайн:** Сначала кроп, потом субтитры — всегда. Один проход рендеринга в Remotion через новую композицию `ShortsVideo`. Композиция рендерит в целевых размерах 9:16, применяет CSS-кроп к `<Video>` и накладывает субтитры поверх.
+
+**Позиционирование субтитров:** Новые поля в схеме не нужны. Бэкенд корректирует `font_size`, `padding_px`, `max_width_pct` в `styleConfig` под соотношение 9:16. Remotion — это «глупый рендерер»: знание о том, что хорошо выглядит в 9:16, живёт в пресетах.
+
+**Спецификация кропа:**
+```typescript
+type CropConfig = {
+  mode: "static" | "keyframe";
+  staticCrop?: { x: number; y: number; zoom: number };  // 0-1 нормализовано
+  keyframes?: Array<{ time: number; x: number; y: number; zoom: number }>;
+  interpolation?: "linear" | "ease" | "smooth";
+};
+```
+
+Статический кроп — вырожденный случай ключевого кропа (один ключевой кадр).
+
+### Бэкенд
+
+**Новый тип джоба:** `ASPECT_CONVERT` в `JobTypeEnum`. Новая функция `crop_to_vertical()` в `media/service.py` через FFmpeg crop+scale фильтр.
+
+**Новый тип артефакта:** `VERTICAL_VIDEO` в `ArtifactTypeEnum`.
+
+**Пайплайн:**
+1. Обрезка исходного видео до временного диапазона клипа (если из вирусной детекции)
+2. Применение кропа (статический центральный кроп или face-tracking кроп из Фичи 3)
+3. Загрузка в S3 по пути `{folder}/vertical/{filename}`
+4. Webhook + нотификация
+
+### Фронтенд
+
+- Превью кропа: перетаскиваемый прямоугольник 9:16 поверх видеоплеера (CSS `object-fit: cover` + `object-position`)
+- Переключатель side-by-side превью: оригинал 16:9 vs обрезанное 9:16
+- Интеграция с Фичей 2: кнопка «Конвертировать в Short» на каждом одобренном вирусном клипе
+- Интеграция с Фичей 3: автозаполнение региона кропа из данных детекции лица
+
+### Время обработки
+
+| Подход | Время (30-мин видео) |
+|---|---|
+| FFmpeg кроп (без субтитров) | 12-36 мин |
+| Remotion кроп + субтитры (один проход) | 11-45 мин |
+| FFmpeg с NVENC (аппаратное кодирование) | 3-5 мин |
+
+### MVP vs Полная версия
+
+- **MVP (6-8 дней):** Ручной выбор региона кропа с превью. Пользователь перетаскивает прямоугольник 9:16 поверх видео. Новая `ShortsVideo` Remotion-композиция рендерит кроп + субтитры.
+- **Полная (+3-4 дня после Фичи 3):** Авто-кроп на основе данных детекции лица. Конвертация в один клик. Пакетная конвертация вирусных клипов.
+
+---
+
+## Рекомендуемый порядок разработки
+
+```
+Неделя 1-2:    Фича 1 (Шаблоны)              ████████
+Неделя 2-4:    Фича 2 (Вирусная детекция)     ████████████████
+Неделя 4-6:    Фича 4 MVP (9:16 кроп)         ████████████████
+Неделя 6-14:   Фича 3 (Трекинг лица)          ████████████████████████████████████████
+Неделя 14-15:  Фича 4 (апгрейд)               ████████
+```
+
+**Почему именно так:**
+1. **Шаблоны первыми** — готовы к реализации, нулевой риск, моментальная польза для пользователей
+2. **Вирусная детекция второй** — лучшее соотношение пользы к трудозатратам ($0.005/видео, 5-7 дней MVP), валидирует гипотезу о том, что пользователи хотят автоматический монтаж
+3. **9:16 MVP третьим** — создаёт композицию `ShortsVideo`, которую потом расширит Фича 3; полезна сама по себе с ручным кропом
+4. **Трекинг лица последним** — самая сложная, самые большие вложения; к этому моменту Фичи 2 и 4 уже валидируют спрос
+5. **Апгрейд 9:16** — тривиален, когда трекинг лица уже даёт позиции
+
+---
+
+## Анализ стоимости
+
+### Стоимость обработки одного видео
+
+| Уровень | Состав | Вычисления | LLM API | Итого | Время ожидания |
+|---|---|---|---|---|---|
+| Только CPU | Всё на CPU | $0.05 | $0.06 | **$0.11** | 35-80 мин |
+| GPU (T4) | ML на GPU, FFmpeg на CPU | $0.11 | $0.06 | **$0.17** | 16-40 мин |
+| GPU + NVENC | Всё на GPU | $0.13 | $0.06 | **$0.19** | 10-15 мин |
+
+Одиннадцать центов на CPU. Девятнадцать с GPU. Даже на самом дорогом варианте — меньше двадцати центов за полный пайплайн с AI-анализом, трекингом лица и кодированием видео.
+
+### Месячная стоимость инфраструктуры (100 видео/месяц)
+
+| Сценарий | Стоимость |
+|---|---|
+| Только CPU (текущая инфра) | ~$11 + сервер |
+| Modal serverless GPU | ~$21/месяц |
+| Spot GPU (g4dn.xlarge) | ~$115/месяц |
+| Постоянный GPU (g4dn.xlarge 24/7) | ~$380/месяц |
+
+**Рекомендация:** Начинаем на CPU. Переходим на Modal serverless GPU, когда время ожидания в очереди превышает 15 минут. При 500+ видео/день — смотрим на spot-инстансы.
+
+### Предлагаемые тарифы SaaS
+
+| Тариф | Цена | Ограничения | Себестоимость | Маржа |
+|---|---|---|---|---|
+| Free | $0 | Видео до 10 мин, низкий приоритет в очереди | ~$0.04/видео | Маркетинг |
+| Pro | $15-30/мес | Видео до 30 мин, GPU ML | ~$0.17/видео при 50 видео | 60-80% |
+| Business | $50-100/мес | Видео до 60 мин, приоритетная очередь, NVENC | ~$0.38/видео | 70-85% |
+
+Маржинальность 60-85%. При масштабировании — только растёт.
+
+---
+
+## Инфраструктурные решения
+
+### Отделение ML-сервиса
+
+**Фаза 1:** ML остаётся в существующих Dramatiq-воркерах. MediaPipe + pyannote добавляют лишь ~280МБ к образу. PyTorch уже установлен через Whisper.
+
+**Фаза 2:** Отдельный Docker-контейнер `ml-worker` на выделенных очередях. Тот же код, другой образ (`Dockerfile.ml`), другие лимиты ресурсов. Docker Compose profiles:
+
+```bash
+docker-compose up                    # По умолчанию: без ML-воркера
+docker-compose --profile ml up       # С ML-воркером
+```
+
+**НЕ строить отдельный HTTP-микросервис.** Dramatiq уже обеспечивает очередь джобов, ретраи, прогресс и отмену. Добавление HTTP service discovery, API-контрактов и health check — оверхед с нулевой пользой для асинхронных нагрузок.
+
+### Немедленные оптимизации (до начала работы над новыми фичами)
+
+| Действие | Эффект | Трудозатраты |
+|---|---|---|
+| Переключить PyTorch на CPU-only индекс | -800МБ размер образа | 1 час |
+| Исправить дефолт `REMOTION_SERVICE_URL` в воркере | Баг-фикс | 5 мин |
+| Добавить лимиты ресурсов к docker-compose сервисам | Предотвращение каскадных OOM | 30 мин |
+| Разбить Dramatiq на пулы очередей (легковесные vs ML vs вычисления) | Предотвращение голодания воркеров | 2-3 часа |
+
+Четыре задачи. Суммарно полдня. Экономия: 800МБ, один баг, и страховка от того, что ML-джоб сожрёт всю память и уронит API.
+
+---
+
+## Сводка по технологическому стеку
+
+### Новые зависимости
+
+| Пакет | Размер | Назначение | Фича |
+|---|---|---|---|
+| `google-generativeai` или `openai` | ~10 МБ | LLM API клиент | 2 |
+| `librosa` | ~20 МБ | Анализ энергии аудио | 2 |
+| `mediapipe` | ~30 МБ | Детекция лиц | 3 |
+| `pyannote-audio` | ~200 МБ | Диаризация спикеров | 3 |
+| `torchaudio` | ~50-80 МБ | Обработка аудио для pyannote | 3 |
+| **Всего новых зависимостей** | **~310-340 МБ** | | |
+
+### Новые бэкенд-модули
+
+| Модуль | Назначение | Фича |
+|---|---|---|
+| `clips` | CRUD клипов, воркфлоу ревью | 2 |
+
+### Новые Remotion-композиции
+
+| Композиция | Назначение | Фича |
+|---|---|---|
+| `ShortsVideo` | Статический/ключевой кроп + субтитры в 9:16 | 4 |
+| `AutoEditVideo` | Динамический кроп с трекингом лица + субтитры | 3 |
+
+### Новые типы джобов
+
+| Тип джоба | Назначение | Фича |
+|---|---|---|
+| `VIRAL_DETECT` | LLM-анализ транскрипции | 2 |
+| `ASPECT_CONVERT` | 9:16 кроп + пере-кодирование | 4 |
+| `FACE_DETECT` | Детекция bounding box лиц | 3 |
+| `SPEAKER_DIARIZE` | Диаризация спикеров | 3 |
+
+---
+
+## Сквозные проблемы
+
+Шесть специалистов — шесть взглядов на одну кодовую базу. Вот что они нашли:
+
+| Проблема | Кто нашёл | Приоритет | Действие |
+|---|---|---|---|
+| PyTorch тащит CUDA-библиотеки на CPU-only инфру (+800МБ) | DevOps | Высокий | Переключить на CPU-only PyTorch индекс |
+| Воркер с `--processes 1 --threads 2` упадёт по OOM на ML-джобах | Performance | Высокий | Разбить на пулы очередей, `--threads 1` для ML |
+| `_get_job_status_sync()` течёт соединениями к БД | Performance | Высокий | Починить до добавления новых акторов |
+| Нет очистки временных файлов при OOM-крэше | Performance | Средний | Добавить периодическую очистку `/tmp` или cron |
+| `tasks/service.py` — 1 674 строки, скоро перевалит за 2К | Backend | Средний | Вынести бойлерплейт акторов в декоратор/контекст-менеджер |
+| Дефолт `REMOTION_SERVICE_URL` в воркере неверный (`localhost:8001`) | DevOps | Средний | Исправить на `http://remotion:3001` в docker-compose |
+| Ни на одном Docker-сервисе нет лимитов ресурсов | DevOps | Средний | Добавить memory/CPU лимиты на все сервисы |
+| Whisper со временем нужно переместить в ML-сервис | Backend | Низкий | Запланировать при разделении ML-воркера в Фазе 2 |
+| Проверка `isCurrent` слова в Captions.tsx хрупкая | Remotion | Низкий | Сравнивать по индексу, а не по тексту + start time |
+
+---
+
+## Отчёты специалистов
+
+Полные выводы специалистов доступны в стенограмме сессии. Ключевые файлы, которые изучал каждый:
+
+- **ML-инженер:** `cpv3/modules/transcription/service.py`, `cpv3/modules/tasks/service.py`, `pyproject.toml`
+- **Backend-архитектор:** `cpv3/modules/tasks/service.py`, `cpv3/modules/jobs/schemas.py`, `cpv3/modules/media/service.py`, `cpv3/modules/captions/service.py`, `docker-compose.yml`
+- **Remotion-инженер:** `remotion_service/src/components/Composition.tsx`, `Captions.tsx`, `Root.tsx`, `useCaptions.ts`, `useVideoMeta.ts`, все определения типов
+- **Frontend-архитектор:** `src/widgets/TimelinePanel/`, `src/features/project/FragmentsStep/`, `src/shared/context/WizardContext.tsx`, `src/shared/store/notifications/`
+- **DevOps-инженер:** `docker-compose.yml`, `Dockerfile`, `pyproject.toml`, `uv.lock`
+- **Инженер по производительности:** `cpv3/modules/tasks/service.py`, `cpv3/modules/media/service.py`, `cpv3/modules/transcription/service.py`, `docker-compose.yml`
diff --git a/docs/consults/video-features-roadmap_v2.md b/docs/consults/video-features-roadmap_v2.md
new file mode 100644
index 0000000..024c80c
--- /dev/null
+++ b/docs/consults/video-features-roadmap_v2.md
@@ -0,0 +1,515 @@
+# Video Features Roadmap — Technical Consultation v2 (API-First)
+
+**Date:** 2026-03-22
+**Specialists consulted:** ML/AI Engineer, Backend Architect, Remotion Engineer, Frontend Architect, DevOps Engineer, Performance Engineer
+**Revision:** v2 — switched to API-first architecture using Deepgram, GigaChat, and DeepInfra
+
+---
+
+## What Changed from v1
+
+v2 replaces local ML models with managed API services. This is the single biggest architectural change — it eliminates PyTorch, GPU infrastructure, ML worker separation, and most memory/processing bottlenecks.
+
+### API Substitutions
+
+| v1 (Local ML) | v2 (API-First) | Impact |
+|---|---|---|
+| Local Whisper (PyTorch, 20-60 min CPU) | **Deepgram Nova-3** API (~30 sec) | Eliminates PyTorch dependency entirely |
+| Local pyannote.audio (PyTorch, 15-30 min CPU) | **Deepgram** `diarize=true` (included in transcription call) | Eliminates pyannote + torchaudio deps |
+| Gemini 2.5 Flash / GPT-4o-mini for viral detection | **GigaChat Pro** (native Russian LLM by Sber) | Better Russian cultural context, humor, slang |
+| librosa for audio energy analysis | **Deepgram** `sentiment=true` per utterance | Sentiment replaces energy analysis for most cases |
+| N/A | **DeepInfra** (Llama, Mistral, Qwen via API) | Fallback and A/B testing for LLM analysis |
+
+### Key Metrics Changed
+
+| Metric | v1 | v2 | Change |
+|---|---|---|---|
+| Docker image size | 1.72 GB | **~400-500 MB** | -75% (no PyTorch) |
+| Peak worker RAM | 8-16 GB | **~400 MB** (MediaPipe only) | -95% |
+| Processing time (30-min video, full pipeline) | 35-80 min (CPU) | **5-10 min** | -85% |
+| Per-video cost | $0.11 | **$0.20** | +80% (API costs) |
+| Monthly cost (100 videos) | $11 compute + server + $0-380 GPU | **$20 APIs + server** | Simpler, cheaper at low volume |
+| GPU needed? | Phase 2 for diarization | **Never** | Eliminated |
+| New Python dependencies | ~310-340 MB | **~40 MB** (mediapipe + HTTP clients) | -88% |
+| MVP total timeline | 26-34 dev-days | **20-27 dev-days** | -20-25% |
+
+### Issues Eliminated
+
+These v1 cross-cutting issues no longer apply:
+
+| v1 Issue | Why It's Gone |
+|---|---|
+| ~~Switch PyTorch to CPU-only index~~ | PyTorch removed entirely (Whisper replaced by Deepgram) |
+| ~~Worker OOM with concurrent ML jobs~~ | No heavy ML — standard 4 GB worker |
+| ~~Separate ML worker Docker image~~ | Single lightweight image |
+| ~~GPU infrastructure planning~~ | All ML is API-based |
+| ~~PyTorch version conflicts~~ | No PyTorch |
+| ~~Model download on first run~~ | No local models (except MediaPipe, ~2 MB) |
+| ~~ML worker separation via Docker Compose profiles~~ | Not needed |
+
+### New Issues Introduced
+
+| Issue | Priority | Mitigation |
+|---|---|---|
+| API key management (Deepgram, GigaChat, DeepInfra) | High | Store in settings via env vars, never in code |
+| API rate limits | High | Retry with exponential backoff in actors |
+| API vendor lock-in | Medium | Abstract behind engine interfaces (like current `engine: "whisper" \| "google"`) |
+| Network dependency (API downtime = no processing) | Medium | Keep Whisper as optional fallback engine |
+| Higher per-video cost ($0.20 vs $0.11) | Low | Offset by zero infrastructure cost; profitable at any SaaS tier |
+
+---
+
+## Feature Overview
+
+| # | Feature | Complexity | MVP | Full | Additional Infra |
+|---|---------|-----------|-----|------|-----------------|
+| 1 | Advanced Remotion Templates | Easy-Medium | 3-4 days | 3-4 days | None — ready to implement |
+| 2 | Viral Moments Detection | Medium | **3-5 days** | 6-10 days | API keys (GigaChat, Deepgram) |
+| 3 | Auto-Cut & Head Tracking | Hard | **8-10 days** | 20-30 days | MediaPipe only (CPU, ~30 MB) |
+| 4 | 9:16 Shorts Conversion | Medium | 6-8 days | +3-4 days after #3 | None |
+| **Total** | | | **20-27 days** | **35-47 days** | |
+
+Realistic for one dev: **5-7 weeks** (all MVPs) or **2-3 months** (full versions).
+
+---
+
+## Feature 1: Advanced Remotion Templates
+
+**No changes from v1.** This feature has no ML dependencies.
+
+**Status:** Spec + implementation plan already written.
+
+- Spec: `docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md`
+- Plan: `docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md`
+
+**Scope:** Extend `CaptionStyleSchema` with 4 new highlight styles (pop_in, karaoke, bounce, glow_pulse), 2 transitions (zoom_in, drop_in), 3 fields (word_entrance, highlight_rotation_deg, text_transform). Seed 2 system presets: "Shorts" and "Podcast".
+
+**Changes:** Schema extensions in Remotion + backend, rendering logic in `Captions.tsx`, Alembic migration for presets, frontend StyleEditor form controls.
+
+---
+
+## Feature 2: Viral Moments Detection
+
+### Architecture (v2 — API-First)
+
+**Transcription:** Deepgram Nova-3 API with `diarize=true` + `sentiment=true`. Single API call returns word-level timestamps, speaker labels, and per-utterance sentiment scores. Cost: $0.0053/min ($0.16 for 30-min video). Processing: ~30 seconds.
+
+**LLM analysis:** GigaChat Pro (by Sber) — native Russian LLM trained on Russian internet content. Better detection of Russian humor, cultural references, slang, and viral patterns than English-first models. Fallback: DeepInfra (Llama 3.1 70B or Qwen) for A/B testing.
+
+**Audio augmentation:** Deepgram's per-utterance sentiment scores replace `librosa` energy analysis for most use cases. High-sentiment utterances correlate with viral moments. Optional: keep `librosa` for audio loudness analysis (laughter, raised voice) as an enhancement.
+
+**Pipeline:**
+1. Deepgram transcription with `diarize=true` + `sentiment=true` → timestamps + speakers + sentiment
+2. Convert Deepgram response to existing `Document` schema (segments, lines, words)
+3. GigaChat analyzes transcription text + sentiment data → viral clip candidates
+4. Post-process: snap boundaries to segment edges, compute composite scores
+5. Save clips to `clips` table
+
+### Backend Design
+
+**New module:** `clips` (models, schemas, repository, service, router) — stores detected clips with project/file/job relationships.
+
+**Clip model:**
+```
+Clip {
+  project_id: UUID (FK projects)
+  source_file_id: UUID (FK files)
+  job_id: UUID? (FK jobs)
+  title: str
+  start_ms: int
+  end_ms: int
+  score: float
+  source_type: "viral_detected" | "user_created" | "auto_generated"
+  status: "pending" | "approved" | "rejected" | "exported"
+  meta: JSON? (LLM reasoning, tags, hashtags, sentiment data)
+}
+```
+
+**New job type:** `VIRAL_DETECT` added to `JobTypeEnum`. Actor calls GigaChat API via `httpx` from Dramatiq worker.
+
+**Transcription engine extension:** Add `"deepgram"` to the existing engine selection (`engine: "whisper" | "google" | "deepgram"`). Deepgram becomes the default for new transcriptions. Whisper remains as a fallback.
+
+**LLM integration:**
+- GigaChat API via `httpx` (OAuth2 token auth via Sber ID)
+- DeepInfra as fallback (OpenAI-compatible API)
+- Prompts stored in `cpv3/infrastructure/prompts/viral_detection_v1.txt`
+- Active version controlled by `LLM_VIRAL_PROMPT_VERSION` env var
+- New settings: `GIGACHAT_CLIENT_ID`, `GIGACHAT_CLIENT_SECRET`, `DEEPINFRA_API_KEY`, `DEEPGRAM_API_KEY`
+
+### Frontend Design
+
+- New `ViralClipsStep` in project wizard (features/project/)
+- Clip list with thumbnails, scores, titles, approve/reject buttons
+- Clip edit modal with video preview (scoped playback for start/end range)
+- New job type `VIRAL_DETECT` in notification handling (existing WebSocket infrastructure)
+
+### Key Numbers
+
+| Metric | v1 | v2 |
+|---|---|---|
+| Transcription time | Depends on Whisper (already done) | ~30 sec (Deepgram, if not already transcribed) |
+| LLM analysis time | 10-20 sec | 10-20 sec (same) |
+| Total processing | 10-20 sec (after transcription) | **40-50 sec** (including Deepgram transcription) |
+| Cost per video | ~$0.005 (LLM only) | **~$0.17** ($0.16 Deepgram + $0.01 GigaChat) |
+| Accuracy (precision) | 50-70% | **60-80%** (GigaChat better at Russian + sentiment data) |
+| New dependencies | `google-generativeai` + `librosa` (~30 MB) | **HTTP client only** (~0 MB new) |
+| MVP time | 5-7 days | **3-5 days** |
+
+### Risks
+
+- **GigaChat API availability** — Sber's API may have lower uptime than Google/OpenAI. Mitigation: DeepInfra fallback.
+- **GigaChat structured output** — verify JSON mode / function calling works reliably for clip extraction. Test early.
+- **Deepgram Russian WER** — ~10-12% WER on Russian (Nova-3). Comparable to Whisper `medium`. Sufficient for viral detection.
+- **Visual-only moments** still missed (~20-30%) — same limitation as v1.
+
+### MVP vs Full
+
+- **MVP (3-5 days):** Deepgram transcription + GigaChat analysis. Returns clips with scores. User reviews and accepts/rejects. No audio energy analysis.
+- **Full (6-10 days):** Add sentiment-weighted scoring, few-shot prompt tuning from user feedback, batch processing, direct clip export to 9:16, DeepInfra A/B testing.
+
+---
+
+## Feature 3: Auto-Cut & Head Tracking
+
+### Architecture (v2 — API-First)
+
+**Face detection:** MediaPipe BlazeFace (unchanged from v1). Apache 2.0, ~2MB model, 30-60 FPS on CPU. Sample at 3 FPS. **This is the only local ML component remaining.** Dependency: `mediapipe` (~30MB).
+
+**Speaker diarization:** **Deepgram API** with `diarize=true` (~30 seconds for 30-min video). Replaces pyannote.audio entirely. Diarization is included in the transcription call — no additional API cost.
+
+**Face-speaker mapping:**
+- Phase 1: Temporal correlation heuristic — match face tracks to Deepgram speaker segments by maximum temporal overlap. 70-85% accuracy for 2-speaker videos. Zero additional dependencies. ~100 lines of Python.
+- Phase 2: TalkNet-ASD — if needed for accuracy. This is the only scenario where GPU would be reconsidered, but can be deferred indefinitely if temporal correlation + user correction is sufficient.
+
+**Video compositing:** Same as v1 — Remotion compositions with CSS transform crop. No changes.
+
+**New Remotion compositions:** Same as v1.
+
+| Composition | Purpose | Phase |
+|---|---|---|
+| `CaptionedVideo` (existing) | Caption overlay on native video | Current |
+| `ShortsVideo` (new) | Static/keyframe crop + captions at 9:16 | Feature 4 |
+| `AutoEditVideo` (new) | Face-tracking crop + cuts + captions | Feature 3 full |
+
+**Crop data format:** Same as v1 (keyframes with normalized 0-1 coordinates).
+
+### Backend Design
+
+**New job types:** `FACE_DETECT` added to `JobTypeEnum`. `SPEAKER_DIARIZE` is **no longer needed as a separate job** — diarization comes from Deepgram as part of transcription.
+
+**ML service separation:** **Not needed.** MediaPipe is lightweight (~30MB, ~400MB RAM). Runs in standard Dramatiq worker.
+
+**Remotion service changes:** Same as v1 — `compositionId` parameter, `crop`/`outputWidth`/`outputHeight` props.
+
+### Processing Time (30-min 1080p video)
+
+| Step | v1 (CPU) | v2 (API-First) |
+|---|---|---|
+| Deepgram transcription + diarization | N/A | **~30 sec** |
+| Face detection (MediaPipe, 3 FPS) | 1-2 min | 1-2 min (unchanged) |
+| ~~Speaker diarization (pyannote)~~ | ~~15-30 min~~ | **Included in Deepgram** |
+| Face-speaker mapping | < 1 sec | < 1 sec |
+| Remotion render (crop + captions) | 10-30 min | 10-30 min (unchanged) |
+| **Total (parallelized)** | **35-80 min** | **12-33 min** |
+
+**The 15-30 min diarization bottleneck is completely eliminated.**
+
+### Memory Requirements
+
+| Config | v1 | v2 |
+|---|---|---|
+| Peak RAM | 8-16 GB | **~400 MB** (MediaPipe only) |
+| Worker config needed | `--threads 1`, 16 GB limit | Standard worker, 4 GB limit |
+
+### Frontend Design
+
+Same as v1:
+- Head tracking preview: video player with face bounding box overlay (canvas)
+- Speaker timeline track in TimelinePanel
+- Controls: zoom level slider, transition speed, speaker selection
+- Before/after comparison toggle
+
+### Key Numbers
+
+| Metric | v1 | v2 |
+|---|---|---|
+| Diarization time | 15-30 min (CPU) / 1-2 min (GPU) | **~30 sec** (API) |
+| Face detection time | 1-2 min | 1-2 min (unchanged) |
+| Total analysis time | 17-33 min (CPU) | **~2 min** |
+| Full pipeline (with render) | 35-80 min (CPU) | **12-33 min** |
+| Peak RAM | 8-16 GB | **~400 MB** |
+| New dependencies | ~280 MB (mediapipe + pyannote + torchaudio) | **~30 MB** (mediapipe only) |
+| GPU needed? | Phase 2 recommended | **No** (only if optional TalkNet-ASD is adopted) |
+| MVP time | 12-15 days | **8-10 days** |
+
+### Risks
+
+- **Face-to-speaker mapping** accuracy unchanged (70-85% with heuristic) — still the hardest subproblem
+- **Deepgram diarization accuracy** — DER may be slightly worse than pyannote 3.1 (~12-15% vs ~10%). Acceptable for this use case.
+- **Video quality loss** when cropping — unchanged from v1
+- **TalkNet-ASD deferred** — if temporal correlation isn't accurate enough, TalkNet requires GPU. Cross that bridge if needed.
+
+### MVP vs Full
+
+- **MVP (8-10 days):** Face detection on sampled frames. Deepgram provides speaker labels. Temporal correlation maps faces to speakers. User can manually correct. Static crop to selected face.
+- **Full (20-30 days):** Dynamic crop following active speaker. Smooth transitions. Split-screen. Multi-speaker. Optional TalkNet-ASD for accuracy.
+
+---
+
+## Feature 4: 9:16 Shorts Conversion
+
+**No changes from v1.** This feature has no ML dependencies.
+
+### Architecture
+
+**Pipeline:** Crop-then-caption, always. Single Remotion render pass using new `ShortsVideo` composition.
+
+**Caption positioning:** No new schema fields needed. Backend adjusts `font_size`, `padding_px`, `max_width_pct` in `styleConfig` for 9:16.
+
+**Crop specification:**
+```typescript
+type CropConfig = {
+  mode: "static" | "keyframe";
+  staticCrop?: { x: number; y: number; zoom: number };
+  keyframes?: Array<{ time: number; x: number; y: number; zoom: number }>;
+  interpolation?: "linear" | "ease" | "smooth";
+};
+```
+
+### Backend Design
+
+**New job type:** `ASPECT_CONVERT` in `JobTypeEnum`. New function `crop_to_vertical()` in `media/service.py`.
+
+**New artifact type:** `VERTICAL_VIDEO` in `ArtifactTypeEnum`.
+
+### Frontend Design
+
+- Crop preview: draggable 9:16 rectangle overlay on video player
+- Side-by-side preview toggle
+- "Convert to Short" button on approved viral clips
+- Auto-populate crop from face detection data (when available)
+
+### Processing Time
+
+| Approach | Time (30-min video) |
+|---|---|
+| FFmpeg crop-only (no captions) | 12-36 min |
+| Remotion crop + captions (single pass) | 11-45 min |
+| FFmpeg with NVENC hardware encoding | 3-5 min |
+
+### MVP vs Full
+
+- **MVP (6-8 days):** Manual crop region selection with preview. `ShortsVideo` Remotion composition.
+- **Full (+3-4 days after Feature 3):** Auto-crop from face detection. One-click conversion. Batch export.
+
+---
+
+## Recommended Build Order
+
+```
+Week 1-2:    Feature 1 (Templates)        ████████
+Week 2-3:    Feature 2 (Viral Detection)  ██████████
+Week 3-5:    Feature 4 MVP (9:16 crop)    ████████████████
+Week 5-10:   Feature 3 (Head Tracking)    ██████████████████████████████
+Week 10-11:  Feature 4 upgrade            ████████
+```
+
+**Rationale:**
+1. **Templates first** — ready to implement, zero risk, immediate user value
+2. **Viral detection second** — fastest ROI with API-first (3-5 days MVP), validates user demand
+3. **9:16 MVP third** — builds `ShortsVideo` composition, useful standalone
+4. **Head tracking last** — still the most complex, but now much simpler without pyannote/GPU
+5. **9:16 upgrade** — trivial once head tracking provides face data
+
+---
+
+## Cost Analysis
+
+### Per-Video Processing Cost (30-min video, all features)
+
+| Component | v1 (Local ML) | v2 (API-First) |
+|---|---|---|
+| Transcription + diarization | $0.07 compute | **$0.16** (Deepgram) |
+| LLM viral detection | $0.005 (Gemini) | **$0.01** (GigaChat) |
+| Face detection | $0.002 compute | $0.002 compute (unchanged) |
+| FFmpeg/Remotion render | $0.02 compute | $0.02 compute |
+| **Total per video** | **$0.11** | **$0.20** |
+
+### Monthly Cost Comparison
+
+| Scale | v1 (Local ML) | v2 (API-First) |
+|---|---|---|
+| 100 videos/month | $11 compute + server + $0-380 GPU | **$20 APIs + server** |
+| 500 videos/month | $55 + $200-380 GPU = $255-435 | **$100 APIs + server** |
+| 1,000 videos/month | $110 + $380 GPU = $490 | **$200 APIs + server** |
+| 5,000 videos/month | $550 + $380 GPU = $930 | **$1,000 APIs + server** |
+
+**Breakeven:** ~2,200-4,200 videos/month, depending on GPU cost ($200-380/month ÷ $0.09 per-video difference). Below that, APIs are cheaper.
+
+### Suggested SaaS Pricing Tiers
+
+| Tier | Price | Limits | Cost/Video | Margin |
+|---|---|---|---|---|
+| Free | $0 | 10-min videos, 5/month | ~$0.07 | Marketing |
+| Pro | $15-30/mo | 30-min videos, 50/month | ~$0.20 | 50-70% |
+| Business | $50-100/mo | 60-min videos, 200/month | ~$0.35 | 65-80% at ~25-30% quota use (full quota costs ~$70/mo) |
+
+---
+
+## Infrastructure (v2 — Simplified)
+
+### Architecture
+
+```
+Frontend → Backend API → Dramatiq Worker (lightweight: MediaPipe only)
+                              ↕              ↕           ↕
+                         PostgreSQL     Deepgram API   GigaChat API
+                         Redis          (transcription  (viral detection)
+                         S3/MinIO        + diarization)
+                         Remotion        DeepInfra
+                                         (fallback LLM)
+```
+
+### Docker Image
+
+| | v1 | v2 |
+|---|---|---|
+| Base | python:3.11-slim + PyTorch + Whisper + CUDA libs | python:3.11-slim + mediapipe |
+| Size | 1.72 GB | **~400-500 MB** |
+| RAM | 16 GB recommended | **4 GB sufficient** |
+
+**Can remove from `pyproject.toml`:** `openai-whisper` (and transitively PyTorch) — if Deepgram fully replaces Whisper. Keep Whisper as optional dependency (`uv sync --group whisper`) for fallback.
+
+### No ML Service Separation Needed
+
+With only MediaPipe (~30MB, ~400MB RAM) running locally, there is no need for:
+- Separate ML worker container
+- Docker Compose profiles for ML
+- GPU infrastructure
+- Dedicated Dramatiq queues for ML
+
+Standard worker with `--processes 1 --threads 2` handles everything.
+
+### New Settings
+
+```python
+# Deepgram
+deepgram_api_key: str = Field(default="", alias="DEEPGRAM_API_KEY")
+
+# GigaChat (Sber)
+gigachat_client_id: str = Field(default="", alias="GIGACHAT_CLIENT_ID")
+gigachat_client_secret: str = Field(default="", alias="GIGACHAT_CLIENT_SECRET")
+
+# DeepInfra (fallback LLM)
+deepinfra_api_key: str = Field(default="", alias="DEEPINFRA_API_KEY")
+
+# LLM config
+llm_provider: str = Field(default="gigachat", alias="LLM_PROVIDER")  # gigachat | deepinfra
+llm_viral_prompt_version: str = Field(default="v1", alias="LLM_VIRAL_PROMPT_VERSION")
+```
+
+---
+
+## Technology Stack Summary
+
+### New Dependencies (v2)
+
+| Package | Size | Purpose | Feature |
+|---|---|---|---|
+| `mediapipe` | ~30 MB | Face detection (CPU) | 3 |
+| `httpx` | Already installed | API calls to Deepgram, GigaChat, DeepInfra | 2, 3 |
+| **Total new deps** | **~30 MB** | | |
+
+### Removed Dependencies (vs v1)
+
+| Package | Size Saved | Was For |
+|---|---|---|
+| ~~`openai-whisper`~~ | ~50 MB + PyTorch ~2 GB | Transcription (replaced by Deepgram) |
+| ~~`pyannote-audio`~~ | ~200 MB | Diarization (replaced by Deepgram) |
+| ~~`torchaudio`~~ | ~50-80 MB | pyannote dependency |
+| ~~`librosa`~~ | ~20 MB | Audio energy (replaced by Deepgram sentiment) |
+| **Total removed** | **~2.3 GB** | |
+
+### New Backend Modules
+
+| Module | Purpose | Feature |
+|---|---|---|
+| `clips` | Clip CRUD, review workflow | 2 |
+
+### New Remotion Compositions
+
+| Composition | Purpose | Feature |
+|---|---|---|
+| `ShortsVideo` | Static/keyframe crop + captions at 9:16 | 4 |
+| `AutoEditVideo` | Face-tracking dynamic crop + captions | 3 |
+
+### New Job Types
+
+| Job Type | Purpose | Feature |
+|---|---|---|
+| `VIRAL_DETECT` | GigaChat analysis of transcription | 2 |
+| `ASPECT_CONVERT` | 9:16 crop + re-encode | 4 |
+| `FACE_DETECT` | Face bounding box detection (MediaPipe) | 3 |
+
+Note: `SPEAKER_DIARIZE` is **no longer a separate job type** — diarization is included in Deepgram transcription.
+
+### Transcription Engine Extension
+
+```python
+# Extend existing engine selection:
+engine: Literal["whisper", "google", "deepgram"] = "deepgram"
+```
+
+Deepgram becomes the default. Whisper remains as optional fallback (requires `uv sync --group whisper`).
+
+---
+
+## Cross-Cutting Issues (v2)
+
+### Remaining from v1
+
+| Issue | Priority | Action |
+|---|---|---|
+| `_get_job_status_sync()` leaks DB connections | High | Fix before adding more actors |
+| `tasks/service.py` at 1,674 lines, will exceed 2K | Medium | Extract actor boilerplate |
+| Worker `REMOTION_SERVICE_URL` default wrong | Medium | Fix to `http://remotion:3001` |
+| No resource limits on Docker services | Medium | Add memory/CPU limits |
+| No temp file cleanup on OOM crash | Medium | Add periodic cleanup |
+| `isCurrent` word identity check in Captions.tsx fragile | Low | Compare by index |
+
+### New in v2
+
+| Issue | Priority | Action |
+|---|---|---|
+| API key management (3 services) | High | All via env vars in settings, never in code |
+| API rate limit handling | High | Retry with exponential backoff in all actors |
+| API vendor lock-in | Medium | Abstract behind engine interface (existing pattern) |
+| Network dependency (API downtime) | Medium | Keep Whisper as optional fallback engine |
+| Deepgram → Document schema conversion | Medium | Build converter to match existing `Document` structure |
+| GigaChat OAuth2 token refresh | Medium | Token caching with auto-refresh in `infrastructure/` |
+
+### Eliminated from v1
+
+| ~~Issue~~ | Why Gone |
+|---|---|
+| ~~PyTorch CPU-only index~~ | PyTorch removed entirely |
+| ~~Worker OOM with ML jobs~~ | No heavy ML locally |
+| ~~ML worker Docker image~~ | Single lightweight image |
+| ~~GPU infrastructure~~ | All heavy ML is API-based; MediaPipe runs on CPU |
+| ~~PyTorch version conflicts~~ | No PyTorch |
+| ~~Model downloads on first run~~ | No local models (except MediaPipe, ~2 MB) |
+
+---
+
+## Specialist Reports (Full Transcripts)
+
+Full specialist outputs are available in the session transcript. Key files each specialist examined:
+
+- **ML Engineer:** `cpv3/modules/transcription/service.py`, `cpv3/modules/tasks/service.py`, `pyproject.toml`
+- **Backend Architect:** `cpv3/modules/tasks/service.py`, `cpv3/modules/jobs/schemas.py`, `cpv3/modules/media/service.py`, `cpv3/modules/captions/service.py`, `docker-compose.yml`
+- **Remotion Engineer:** `remotion_service/src/components/Composition.tsx`, `Captions.tsx`, `Root.tsx`, `useCaptions.ts`, `useVideoMeta.ts`, all type definitions
+- **Frontend Architect:** `src/widgets/TimelinePanel/`, `src/features/project/FragmentsStep/`, `src/shared/context/WizardContext.tsx`, `src/shared/store/notifications/`
+- **DevOps Engineer:** `docker-compose.yml`, `Dockerfile`, `pyproject.toml`, `uv.lock`
+- **Performance Engineer:** `cpv3/modules/tasks/service.py`, `cpv3/modules/media/service.py`, `cpv3/modules/transcription/service.py`, `docker-compose.yml`
+
+Note: Specialist reports were produced for v1 architecture (local ML). Their recommendations for Remotion compositions, backend module design, frontend components, and crop data formats remain valid in v2. The infrastructure and ML model recommendations are superseded by the API-first approach.
diff --git a/docs/consults/video-features-roadmap_v2_ru.html b/docs/consults/video-features-roadmap_v2_ru.html
new file mode 100644
index 0000000..dd9c356
--- /dev/null
+++ b/docs/consults/video-features-roadmap_v2_ru.html
@@ -0,0 +1,1341 @@
+<!DOCTYPE html>
+<html lang="ru">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Video Features Roadmap v2 — API-First</title>
+<style>
+  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500;600&display=swap');
+
+  :root {
+    --bg: #0a0a0f;
+    --bg-card: #12121a;
+    --bg-card-hover: #1a1a26;
+    --bg-code: #1e1e2e;
+    --border: #2a2a3a;
+    --border-accent: #3a3a5a;
+    --text: #e4e4ef;
+    --text-dim: #8888a0;
+    --text-muted: #5a5a72;
+    --accent: #7c6aef;
+    --accent-glow: #7c6aef40;
+    --green: #34d399;
+    --green-dim: #34d39930;
+    --red: #f87171;
+    --red-dim: #f8717130;
+    --yellow: #fbbf24;
+    --yellow-dim: #fbbf2430;
+    --blue: #60a5fa;
+    --blue-dim: #60a5fa30;
+    --cyan: #22d3ee;
+    --pink: #f472b6;
+  }
+
+  * { margin: 0; padding: 0; box-sizing: border-box; }
+
+  html {
+    scroll-behavior: smooth;
+    font-size: 16px;
+  }
+
+  body {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
+    background: var(--bg);
+    color: var(--text);
+    line-height: 1.7;
+    -webkit-font-smoothing: antialiased;
+  }
+
+  /* ── Hero ── */
+  .hero {
+    position: relative;
+    padding: 5rem 2rem 4rem;
+    text-align: center;
+    overflow: hidden;
+    background: linear-gradient(180deg, #12102a 0%, var(--bg) 100%);
+  }
+
+  .hero::before {
+    content: '';
+    position: absolute;
+    top: -50%;
+    left: 50%;
+    transform: translateX(-50%);
+    width: 800px;
+    height: 800px;
+    background: radial-gradient(circle, var(--accent-glow) 0%, transparent 70%);
+    pointer-events: none;
+  }
+
+  .hero-badge {
+    display: inline-flex;
+    align-items: center;
+    gap: .5rem;
+    padding: .4rem 1rem;
+    border-radius: 100px;
+    background: var(--accent-glow);
+    border: 1px solid #7c6aef50;
+    font-size: .8rem;
+    font-weight: 600;
+    color: var(--accent);
+    letter-spacing: .04em;
+    text-transform: uppercase;
+    margin-bottom: 1.5rem;
+  }
+
+  .hero h1 {
+    font-size: clamp(2rem, 5vw, 3.2rem);
+    font-weight: 800;
+    letter-spacing: -.03em;
+    line-height: 1.15;
+    background: linear-gradient(135deg, #fff 0%, #b8b0f0 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    max-width: 700px;
+    margin: 0 auto 1.2rem;
+  }
+
+  .hero-meta {
+    color: var(--text-dim);
+    font-size: .9rem;
+    line-height: 1.8;
+  }
+
+  .hero-meta strong { color: var(--text); font-weight: 600; }
+
+  /* ── Navigation ── */
+  .toc {
+    position: sticky;
+    top: 0;
+    z-index: 100;
+    background: #0a0a0fdd;
+    backdrop-filter: blur(16px);
+    -webkit-backdrop-filter: blur(16px);
+    border-bottom: 1px solid var(--border);
+    padding: 0 2rem;
+    overflow-x: auto;
+  }
+
+  .toc-inner {
+    max-width: 1100px;
+    margin: 0 auto;
+    display: flex;
+    gap: .25rem;
+    padding: .5rem 0;
+  }
+
+  .toc a {
+    flex-shrink: 0;
+    padding: .45rem .85rem;
+    border-radius: 8px;
+    font-size: .78rem;
+    font-weight: 500;
+    color: var(--text-dim);
+    text-decoration: none;
+    transition: all .2s;
+    white-space: nowrap;
+  }
+
+  .toc a:hover {
+    color: var(--text);
+    background: var(--bg-card);
+  }
+
+  /* ── Container ── */
+  .container {
+    max-width: 1100px;
+    margin: 0 auto;
+    padding: 0 2rem;
+  }
+
+  /* ── Sections ── */
+  section {
+    padding: 4rem 0 2rem;
+  }
+
+  section + section {
+    border-top: 1px solid var(--border);
+  }
+
+  h2 {
+    font-size: 1.75rem;
+    font-weight: 800;
+    letter-spacing: -.025em;
+    margin-bottom: .75rem;
+    color: #fff;
+  }
+
+  h2 .num {
+    color: var(--accent);
+    font-weight: 700;
+    margin-right: .3rem;
+  }
+
+  h3 {
+    font-size: 1.15rem;
+    font-weight: 700;
+    margin: 2.2rem 0 .8rem;
+    color: #ddd;
+    letter-spacing: -.01em;
+  }
+
+  h4 {
+    font-size: .95rem;
+    font-weight: 600;
+    margin: 1.5rem 0 .5rem;
+    color: var(--text-dim);
+    text-transform: uppercase;
+    letter-spacing: .05em;
+  }
+
+  p {
+    margin-bottom: 1rem;
+    color: var(--text);
+  }
+
+  .lead {
+    font-size: 1.1rem;
+    color: var(--text-dim);
+    max-width: 750px;
+    line-height: 1.75;
+    margin-bottom: 2rem;
+  }
+
+  .callout {
+    padding: 1rem 1.25rem;
+    border-radius: 10px;
+    margin: 1.5rem 0;
+    font-size: .95rem;
+    line-height: 1.65;
+  }
+
+  .callout-accent {
+    background: var(--accent-glow);
+    border-left: 3px solid var(--accent);
+    color: #c8c0f8;
+  }
+
+  .callout-green {
+    background: var(--green-dim);
+    border-left: 3px solid var(--green);
+    color: #a7f3d0;
+  }
+
+  .callout-yellow {
+    background: var(--yellow-dim);
+    border-left: 3px solid var(--yellow);
+    color: #fde68a;
+  }
+
+  .callout strong { color: #fff; }
+
+  /* ── Stat cards ── */
+  .stats {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 1rem;
+    margin: 2rem 0;
+  }
+
+  .stat {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 1.25rem 1.25rem 1rem;
+    transition: border-color .2s;
+  }
+
+  .stat:hover { border-color: var(--border-accent); }
+
+  .stat-value {
+    font-size: 2rem;
+    font-weight: 800;
+    letter-spacing: -.03em;
+    line-height: 1.1;
+    margin-bottom: .35rem;
+  }
+
+  .stat-value.green { color: var(--green); }
+  .stat-value.red { color: var(--red); }
+  .stat-value.blue { color: var(--blue); }
+  .stat-value.accent { color: var(--accent); }
+  .stat-value.yellow { color: var(--yellow); }
+  .stat-value.cyan { color: var(--cyan); }
+
+  .stat-label {
+    font-size: .78rem;
+    color: var(--text-dim);
+    font-weight: 500;
+    line-height: 1.4;
+  }
+
+  .stat-change {
+    font-size: .72rem;
+    font-weight: 600;
+    margin-top: .35rem;
+    display: inline-block;
+    padding: .15rem .5rem;
+    border-radius: 100px;
+  }
+
+  .stat-change.good { background: var(--green-dim); color: var(--green); }
+  .stat-change.bad { background: var(--red-dim); color: var(--red); }
+  .stat-change.neutral { background: var(--blue-dim); color: var(--blue); }
+
+  /* ── Tables ── */
+  .table-wrap {
+    overflow-x: auto;
+    margin: 1.25rem 0;
+    border-radius: 12px;
+    border: 1px solid var(--border);
+  }
+
+  table {
+    width: 100%;
+    border-collapse: collapse;
+    font-size: .85rem;
+  }
+
+  thead {
+    background: #16162a;
+  }
+
+  th {
+    text-align: left;
+    padding: .75rem 1rem;
+    font-weight: 600;
+    color: var(--text-dim);
+    font-size: .75rem;
+    text-transform: uppercase;
+    letter-spacing: .06em;
+    white-space: nowrap;
+    border-bottom: 1px solid var(--border);
+  }
+
+  td {
+    padding: .65rem 1rem;
+    border-bottom: 1px solid #1a1a28;
+    vertical-align: top;
+    color: var(--text);
+  }
+
+  tr:last-child td { border-bottom: none; }
+  tr:hover td { background: #16162220; }
+
+  td del {
+    color: var(--text-muted);
+    text-decoration: line-through;
+    text-decoration-color: var(--red);
+  }
+
+  td strong { color: #fff; font-weight: 600; }
+
+  .tag {
+    display: inline-block;
+    padding: .15rem .55rem;
+    border-radius: 6px;
+    font-size: .72rem;
+    font-weight: 600;
+    white-space: nowrap;
+  }
+
+  .tag-green { background: var(--green-dim); color: var(--green); }
+  .tag-red { background: var(--red-dim); color: var(--red); }
+  .tag-yellow { background: var(--yellow-dim); color: var(--yellow); }
+  .tag-blue { background: var(--blue-dim); color: var(--blue); }
+  .tag-accent { background: var(--accent-glow); color: var(--accent); }
+
+  /* ── Code ── */
+  pre {
+    background: var(--bg-code);
+    border: 1px solid var(--border);
+    border-radius: 10px;
+    padding: 1.25rem 1.5rem;
+    overflow-x: auto;
+    margin: 1rem 0;
+    font-size: .82rem;
+    line-height: 1.7;
+  }
+
+  code {
+    font-family: 'JetBrains Mono', monospace;
+    font-size: .82em;
+  }
+
+  p code, li code, td code {
+    background: var(--bg-code);
+    padding: .15rem .45rem;
+    border-radius: 5px;
+    color: var(--cyan);
+    border: 1px solid var(--border);
+    font-size: .8em;
+  }
+
+  /* ── Lists ── */
+  ul, ol {
+    padding-left: 1.5rem;
+    margin-bottom: 1rem;
+  }
+
+  li {
+    margin-bottom: .4rem;
+    color: var(--text);
+    line-height: 1.65;
+  }
+
+  li strong { color: #fff; }
+
+  /* ── Feature cards ── */
+  .feature-header {
+    display: flex;
+    align-items: center;
+    gap: 1rem;
+    margin-bottom: 1.5rem;
+    flex-wrap: wrap;
+  }
+
+  .feature-num {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    width: 48px;
+    height: 48px;
+    border-radius: 14px;
+    font-size: 1.3rem;
+    font-weight: 800;
+    flex-shrink: 0;
+  }
+
+  .feature-num-1 { background: var(--green-dim); color: var(--green); }
+  .feature-num-2 { background: var(--accent-glow); color: var(--accent); }
+  .feature-num-3 { background: var(--red-dim); color: var(--red); }
+  .feature-num-4 { background: var(--blue-dim); color: var(--blue); }
+
+  .feature-title {
+    font-size: 1.5rem;
+    font-weight: 800;
+    color: #fff;
+    letter-spacing: -.02em;
+  }
+
+  /* ── Pipeline ── */
+  .pipeline {
+    display: flex;
+    gap: .5rem;
+    align-items: center;
+    flex-wrap: wrap;
+    margin: 1.25rem 0;
+  }
+
+  .pipeline-step {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 10px;
+    padding: .6rem 1rem;
+    font-size: .82rem;
+    font-weight: 500;
+    color: var(--text);
+    position: relative;
+  }
+
+  .pipeline-step .num {
+    color: var(--accent);
+    font-weight: 700;
+    margin-right: .3rem;
+  }
+
+  .pipeline-arrow {
+    color: var(--text-muted);
+    font-size: 1.2rem;
+    flex-shrink: 0;
+  }
+
+  /* ── Gantt ── */
+  .gantt {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 1.5rem;
+    margin: 1.5rem 0;
+    overflow-x: auto;
+  }
+
+  .gantt-row {
+    display: grid;
+    grid-template-columns: 200px 1fr;
+    align-items: center;
+    gap: 1rem;
+    margin-bottom: .6rem;
+    font-size: .82rem;
+  }
+
+  .gantt-label {
+    font-weight: 600;
+    color: var(--text);
+    white-space: nowrap;
+  }
+
+  .gantt-bar-wrap {
+    height: 28px;
+    position: relative;
+  }
+
+  .gantt-bar {
+    height: 100%;
+    border-radius: 6px;
+    position: absolute;
+    top: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: .7rem;
+    font-weight: 600;
+    color: #fff;
+    min-width: 60px;
+  }
+
+  /* ── Arch diagram ── */
+  .arch-diagram {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 2rem;
+    margin: 1.5rem 0;
+    text-align: center;
+  }
+
+  .arch-row {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    gap: .75rem;
+    margin-bottom: 1rem;
+    flex-wrap: wrap;
+  }
+
+  .arch-box {
+    padding: .6rem 1.1rem;
+    border-radius: 10px;
+    font-size: .8rem;
+    font-weight: 600;
+    border: 1px solid;
+    white-space: nowrap;
+  }
+
+  .arch-box-primary { background: #7c6aef20; border-color: #7c6aef60; color: var(--accent); }
+  .arch-box-service { background: #34d39920; border-color: #34d39960; color: var(--green); }
+  .arch-box-api { background: #60a5fa20; border-color: #60a5fa60; color: var(--blue); }
+  .arch-box-storage { background: #fbbf2420; border-color: #fbbf2460; color: var(--yellow); }
+
+  .arch-arrow {
+    color: var(--text-muted);
+    font-size: 1.2rem;
+  }
+
+  .arch-arrow-down {
+    display: block;
+    color: var(--text-muted);
+    font-size: 1.2rem;
+    margin: .3rem 0;
+  }
+
+  /* ── Risk items ── */
+  .risk {
+    padding: .85rem 1rem;
+    border-radius: 10px;
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    margin-bottom: .6rem;
+    font-size: .88rem;
+    line-height: 1.6;
+  }
+
+  .risk strong { color: var(--yellow); }
+
+  /* ── Comparison cards ── */
+  .comparison {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 1rem;
+    margin: 1.5rem 0;
+  }
+
+  @media (max-width: 700px) {
+    .comparison { grid-template-columns: 1fr; }
+    .gantt-row { grid-template-columns: 140px 1fr; }
+    .stats { grid-template-columns: repeat(2, 1fr); }
+  }
+
+  .comparison-card {
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 12px;
+    padding: 1.25rem;
+  }
+
+  .comparison-card h4 {
+    margin-top: 0;
+    font-size: .82rem;
+  }
+
+  .comparison-card p {
+    font-size: .88rem;
+    color: var(--text-dim);
+    margin: 0;
+  }
+
+  /* ── Footer ── */
+  footer {
+    padding: 3rem 2rem;
+    text-align: center;
+    color: var(--text-muted);
+    font-size: .82rem;
+    border-top: 1px solid var(--border);
+  }
+
+  footer a {
+    color: var(--accent);
+    text-decoration: none;
+  }
+
+  /* ── Scrollbar ── */
+  ::-webkit-scrollbar { width: 6px; height: 6px; }
+  ::-webkit-scrollbar-track { background: transparent; }
+  ::-webkit-scrollbar-thumb { background: var(--border-accent); border-radius: 3px; }
+</style>
+</head>
+<body>
+
+<!-- ═══════ HERO ═══════ -->
+<header class="hero">
+  <div class="hero-badge">v2 &mdash; API-First Architecture</div>
+  <h1>Дорожная карта видеофич</h1>
+  <div class="hero-meta">
+    <strong>Техническая консультация</strong> &middot; 22 марта 2026<br>
+    ML/AI-инженер &middot; Backend-архитектор &middot; Remotion-инженер &middot; Frontend-архитектор &middot; DevOps &middot; Performance
+  </div>
+</header>
+
+<!-- ═══════ NAV ═══════ -->
+<nav class="toc">
+  <div class="toc-inner">
+    <a href="#delta">v1 &rarr; v2</a>
+    <a href="#overview">Обзор</a>
+    <a href="#f1">1. Шаблоны</a>
+    <a href="#f2">2. Вирусные моменты</a>
+    <a href="#f3">3. Трекинг лица</a>
+    <a href="#f4">4. Shorts 9:16</a>
+    <a href="#order">Порядок</a>
+    <a href="#cost">Стоимость</a>
+    <a href="#infra">Инфраструктура</a>
+    <a href="#stack">Стек</a>
+    <a href="#issues">Проблемы</a>
+  </div>
+</nav>
+
+<div class="container">
+
+<!-- ═══════ DELTA ═══════ -->
+<section id="delta">
+  <h2>Что изменилось по сравнению с v1</h2>
+  <p class="lead">
+    Одно принципиальное решение перевернуло всю архитектуру: вместо локальных ML-моделей — управляемые API-сервисы.
+    PyTorch, GPU-инфраструктура, разделение ML-воркеров, большинство проблем с памятью и временем обработки — всё это просто исчезло.
+  </p>
+
+  <h3>Замены API</h3>
+  <div class="table-wrap">
+    <table>
+      <thead>
+        <tr><th>v1 (локальный ML)</th><th>v2 (API-First)</th><th>Эффект</th></tr>
+      </thead>
+      <tbody>
+        <tr><td>Локальный Whisper (PyTorch, 20-60 мин CPU)</td><td><strong>Deepgram Nova-3</strong> API (~30 сек)</td><td>PyTorch больше не нужен вообще</td></tr>
+        <tr><td>Локальный pyannote.audio (15-30 мин CPU)</td><td><strong>Deepgram</strong> <code>diarize=true</code></td><td>pyannote + torchaudio — удалены</td></tr>
+        <tr><td>Gemini 2.5 Flash / GPT-4o-mini</td><td><strong>GigaChat Pro</strong> (Сбер)</td><td>Нативный русский: юмор, сленг, контекст</td></tr>
+        <tr><td>librosa (энергия аудио)</td><td><strong>Deepgram</strong> <code>sentiment=true</code></td><td>Сентимент заменяет анализ энергии</td></tr>
+        <tr><td>&mdash;</td><td><strong>DeepInfra</strong> (Llama, Mistral, Qwen)</td><td>Фоллбэк / A/B-тестирование LLM</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Ключевые метрики</h3>
+  <div class="stats">
+    <div class="stat">
+      <div class="stat-value green">-75%</div>
+      <div class="stat-label">Размер Docker-образа</div>
+      <div class="stat-change good">1.72 ГБ &rarr; 400-500 МБ</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value green">-95%</div>
+      <div class="stat-label">Пиковое потребление RAM</div>
+      <div class="stat-change good">8-16 ГБ &rarr; 400 МБ</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value green">-85%</div>
+      <div class="stat-label">Время обработки</div>
+      <div class="stat-change good">35-80 мин &rarr; 5-10 мин</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value yellow">+80%</div>
+      <div class="stat-label">Стоимость за видео</div>
+      <div class="stat-change bad">$0.11 &rarr; $0.20 (API)</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value green">-88%</div>
+      <div class="stat-label">Новые Python-зависимости</div>
+      <div class="stat-change good">310-340 МБ &rarr; 40 МБ</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value cyan">Нет</div>
+      <div class="stat-label">Нужен GPU?</div>
+      <div class="stat-change good">Только при опциональном TalkNet-ASD</div>
+    </div>
+  </div>
+
+  <div class="callout callout-accent">
+    Образ в четыре раза легче. RAM в сорок раз меньше. Обработка в шесть-восемь раз быстрее. Цена за единицу чуть выше, но инфраструктурные расходы сжимаются до нуля. Неплохой обмен.
+  </div>
+
+  <h3>Проблемы, которых больше нет</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Проблема из v1</th><th>Почему исчезла</th></tr></thead>
+      <tbody>
+        <tr><td><del>Переключить PyTorch на CPU-only индекс</del></td><td>PyTorch удалён полностью</td></tr>
+        <tr><td><del>OOM воркера на параллельных ML-джобах</del></td><td>Нет тяжёлого ML — стандартный воркер 4 ГБ</td></tr>
+        <tr><td><del>Отдельный Docker-образ для ML-воркера</del></td><td>Один лёгкий образ</td></tr>
+        <tr><td><del>Планирование GPU-инфраструктуры</del></td><td>Весь тяжёлый ML через API; MediaPipe работает на CPU</td></tr>
+        <tr><td><del>Конфликты версий PyTorch</del></td><td>Нет PyTorch</td></tr>
+        <tr><td><del>Скачивание моделей при первом запуске</del></td><td>Нет локальных моделей (кроме MediaPipe, ~2 МБ)</td></tr>
+        <tr><td><del>Docker Compose profiles для ML</del></td><td>Не нужно</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Новые проблемы</h3>
+  <p style="color: var(--text-dim); margin-bottom: .75rem;">Ничего бесплатного не бывает. Вот что появилось взамен:</p>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Проблема</th><th>Приоритет</th><th>Митигация</th></tr></thead>
+      <tbody>
+        <tr><td>Управление API-ключами (3 сервиса)</td><td><span class="tag tag-red">Высокий</span></td><td>Через env-переменные в settings, никогда в коде</td></tr>
+        <tr><td>Rate limit'ы API</td><td><span class="tag tag-red">Высокий</span></td><td>Retry с exponential backoff в акторах</td></tr>
+        <tr><td>Vendor lock-in</td><td><span class="tag tag-yellow">Средний</span></td><td>Абстрагировать за интерфейсами движков</td></tr>
+        <tr><td>API упал = обработка встала</td><td><span class="tag tag-yellow">Средний</span></td><td>Whisper как опциональный фоллбэк</td></tr>
+        <tr><td>$0.20 vs $0.11 за видео</td><td><span class="tag tag-blue">Низкий</span></td><td>Нулевые инфраструктурные расходы; прибыльно на любом платном SaaS-тарифе</td></tr>
+      </tbody>
+    </table>
+  </div>
+</section>
+
+<!-- ═══════ OVERVIEW ═══════ -->
+<section id="overview">
+  <h2>Общая картина</h2>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>#</th><th>Фича</th><th>Сложность</th><th>MVP</th><th>Полная</th><th>Доп. инфраструктура</th></tr></thead>
+      <tbody>
+        <tr><td><strong>1</strong></td><td>Продвинутые шаблоны Remotion</td><td><span class="tag tag-green">Легко-средне</span></td><td>3-4 дня</td><td>3-4 дня</td><td>Ничего</td></tr>
+        <tr><td><strong>2</strong></td><td>Детекция вирусных моментов</td><td><span class="tag tag-yellow">Средне</span></td><td><strong>3-5 дней</strong></td><td>6-10 дней</td><td>API-ключи</td></tr>
+        <tr><td><strong>3</strong></td><td>Авто-монтаж и трекинг лица</td><td><span class="tag tag-red">Сложно</span></td><td><strong>8-10 дней</strong></td><td>20-30 дней</td><td>MediaPipe (~30 МБ)</td></tr>
+        <tr><td><strong>4</strong></td><td>Shorts 9:16</td><td><span class="tag tag-yellow">Средне</span></td><td>6-8 дней</td><td>+3-4 дня после #3</td><td>Ничего</td></tr>
+        <tr style="background: #16162a;"><td></td><td><strong>Итого</strong></td><td></td><td><strong>20-27 дней</strong></td><td><strong>35-47 дней</strong></td><td></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-green">
+    Реалистичный прогноз для одного разработчика: <strong>5-7 недель</strong> (все MVP) или <strong>2-3 месяца</strong> (полные версии).
+  </div>
+</section>
+
+<!-- ═══════ FEATURE 1 ═══════ -->
+<section id="f1">
+  <div class="feature-header">
+    <div class="feature-num feature-num-1">1</div>
+    <div class="feature-title">Продвинутые шаблоны Remotion</div>
+  </div>
+
+  <div class="callout callout-green">
+    <strong>Без изменений по сравнению с v1.</strong> Эта фича не зависит от ML. Спецификация и план уже написаны &mdash; бери и делай.
+  </div>
+
+  <p><strong>Что делаем:</strong> Расширяем <code>CaptionStyleSchema</code> четырьмя новыми стилями подсветки (<code>pop_in</code>, <code>karaoke</code>, <code>bounce</code>, <code>glow_pulse</code>), двумя переходами (<code>zoom_in</code>, <code>drop_in</code>), тремя полями. Два системных пресета: "Shorts" и "Podcast".</p>
+
+  <p><strong>Где трогаем код:</strong> Расширение схемы в Remotion + бэкенде, логика рендеринга в <code>Captions.tsx</code>, Alembic-миграция для пресетов, контролы в StyleEditor на фронте.</p>
+</section>
+
+<!-- ═══════ FEATURE 2 ═══════ -->
+<section id="f2">
+  <div class="feature-header">
+    <div class="feature-num feature-num-2">2</div>
+    <div class="feature-title">Детекция вирусных моментов</div>
+  </div>
+
+  <h3>Архитектура (v2 &mdash; API-First)</h3>
+  <p class="lead">
+    В v1 мы планировали гонять LLM по тексту и считать энергию аудио через librosa.
+    В v2 подход элегантнее: один вызов Deepgram даёт транскрипцию, разметку спикеров и сентимент-анализ &mdash; три результата за одну цену.
+    А текст анализирует GigaChat, которому русский язык родной.
+  </p>
+
+  <p><strong>Транскрипция:</strong> Deepgram Nova-3 API с <code>diarize=true</code> + <code>sentiment=true</code>. Один вызов &mdash; пословные таймстемпы, метки спикеров, оценка сентимента. Стоимость: $0.0053/мин ($0.16 за 30-минутное видео). Обработка: ~30 секунд.</p>
+
+  <p><strong>LLM-анализ:</strong> GigaChat Pro (Сбер) &mdash; нативная русскоязычная LLM. Лучше ловит русский юмор, культурные отсылки, сленг и вирусные паттерны. Фоллбэк: DeepInfra (Llama 3.1 70B или Qwen) для A/B-тестирования.</p>
+
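+  <p><strong>Аудио-подкрепление:</strong> Сентимент от Deepgram заменяет анализ энергии аудио через librosa. Сегменты с ярко выраженным сентиментом (сильно позитивным или негативным) коррелируют с вирусными моментами. Перед реализацией проверить, что <code>sentiment=true</code> поддерживается для русского языка.</p>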
+
+  <h4>Пайплайн</h4>
+  <div class="pipeline">
+    <div class="pipeline-step"><span class="num">1</span> Deepgram транскрипция</div>
+    <div class="pipeline-arrow">&rarr;</div>
+    <div class="pipeline-step"><span class="num">2</span> Конвертация в Document</div>
+    <div class="pipeline-arrow">&rarr;</div>
+    <div class="pipeline-step"><span class="num">3</span> GigaChat анализ</div>
+    <div class="pipeline-arrow">&rarr;</div>
+    <div class="pipeline-step"><span class="num">4</span> Постобработка</div>
+    <div class="pipeline-arrow">&rarr;</div>
+    <div class="pipeline-step"><span class="num">5</span> Сохранение в clips</div>
+  </div>
+
+  <h3>Бэкенд</h3>
+  <p><strong>Новый модуль:</strong> <code>clips</code> (models, schemas, repository, service, router).</p>
+
+  <h4>Модель клипа</h4>
+  <pre><code>Clip {
+  project_id: UUID (FK projects)
+  source_file_id: UUID (FK files)
+  job_id: UUID? (FK jobs)
+  title: str
+  start_ms: int
+  end_ms: int
+  score: float
+  source_type: "viral_detected" | "user_created" | "auto_generated"
+  status: "pending" | "approved" | "rejected" | "exported"
+  meta: JSON? (рассуждения LLM, теги, хэштеги, данные сентимента)
+}</code></pre>
+
+  <p><strong>Новый тип джоба:</strong> <code>VIRAL_DETECT</code> в <code>JobTypeEnum</code>. GigaChat через <code>httpx</code> из Dramatiq-воркера.</p>
+  <p><strong>Расширение движков:</strong> <code>engine: "whisper" | "google" | "deepgram"</code>. Deepgram &mdash; дефолт.</p>
+
+  <h4>Интеграция с LLM</h4>
+  <ul>
+    <li>GigaChat API через <code>httpx</code> (OAuth2 token auth через Sber ID)</li>
+    <li>DeepInfra как фоллбэк (OpenAI-совместимый API)</li>
+    <li>Промпты в <code>cpv3/infrastructure/prompts/viral_detection_v1.txt</code></li>
+    <li>Настройки: <code>GIGACHAT_CLIENT_ID</code>, <code>GIGACHAT_CLIENT_SECRET</code>, <code>DEEPINFRA_API_KEY</code>, <code>DEEPGRAM_API_KEY</code></li>
+  </ul>
+
+  <h3>Ключевые цифры</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Метрика</th><th>v1</th><th>v2</th></tr></thead>
+      <tbody>
+        <tr><td>Время транскрипции</td><td>Зависит от Whisper</td><td><strong>~30 сек</strong> (Deepgram)</td></tr>
+        <tr><td>Время LLM-анализа</td><td>10-20 сек</td><td>10-20 сек</td></tr>
+        <tr><td>Общее время</td><td>10-20 сек (после транскрипции)</td><td><strong>40-50 сек</strong></td></tr>
+        <tr><td>Стоимость за видео</td><td>~$0.005</td><td><strong>~$0.17</strong></td></tr>
+        <tr><td>Точность (precision)</td><td>50-70%</td><td><strong>60-80%</strong></td></tr>
+        <tr><td>Новые зависимости</td><td>~30 МБ</td><td><strong>~0 МБ</strong></td></tr>
+        <tr><td>Срок MVP</td><td>5-7 дней</td><td><strong>3-5 дней</strong></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-accent">
+    Стоимость выросла с полцента до семнадцати центов &mdash; но зато ноль новых зависимостей, ноль локальных моделей и на два дня быстрее в разработке.
+  </div>
+
+  <h3>Риски</h3>
+  <div class="risk"><strong>Доступность GigaChat API</strong> &mdash; uptime может быть ниже, чем у Google/OpenAI. Митигация: фоллбэк на DeepInfra.</div>
+  <div class="risk"><strong>Structured output GigaChat</strong> &mdash; проверить JSON mode / function calling. Тестировать рано.</div>
+  <div class="risk"><strong>WER Deepgram на русском</strong> &mdash; ~10-12% (Nova-3). Сопоставимо с Whisper <code>medium</code>. Достаточно.</div>
+  <div class="risk"><strong>Визуальные моменты</strong> по-прежнему не ловятся (~20-30%).</div>
+
+  <h3>MVP vs Полная версия</h3>
+  <div class="comparison">
+    <div class="comparison-card">
+      <h4>MVP &mdash; 3-5 дней</h4>
+      <p>Deepgram + GigaChat. Клипы со скорами. Пользователь ревьюит. Без аудио-скоринга (сентимент &mdash; в полной версии).</p>
+    </div>
+    <div class="comparison-card">
+      <h4>Полная &mdash; 6-10 дней</h4>
+      <p>Сентимент-скоринг, few-shot тюнинг, пакетная обработка, экспорт в 9:16, A/B через DeepInfra.</p>
+    </div>
+  </div>
+</section>
+
+<!-- ═══════ FEATURE 3 ═══════ -->
+<section id="f3">
+  <div class="feature-header">
+    <div class="feature-num feature-num-3">3</div>
+    <div class="feature-title">Авто-монтаж и трекинг лица</div>
+  </div>
+
+  <p class="lead">
+    В v1 эта фича была монстром: pyannote на CPU 30 минут жуёт аудио, PyTorch конфликтует с Whisper, GPU-воркеры, 16 ГБ RAM.
+    В v2 всё, что касалось диаризации, ушло в один API-вызов Deepgram. Осталась только детекция лиц через MediaPipe &mdash; лёгкая библиотека, работающая на CPU за минуту-две.
+  </p>
+
+  <h3>Архитектура (v2 &mdash; API-First)</h3>
+  <p><strong>Детекция лиц:</strong> MediaPipe BlazeFace. Apache 2.0, модель ~2 МБ (пакет <code>mediapipe</code> ~30 МБ), 30-60 FPS на CPU. Сэмплируем видео с частотой 3 FPS. <strong>Единственный оставшийся локальный ML-компонент.</strong></p>
+  <p><strong>Диаризация спикеров:</strong> Deepgram API с <code>diarize=true</code> (~30 сек на 30-мин видео). Полностью заменяет pyannote.</p>
+
+  <h4>Маппинг лицо-спикер</h4>
+  <div class="comparison">
+    <div class="comparison-card">
+      <h4>Фаза 1</h4>
+      <p>Временная корреляция: треки лиц &times; сегменты спикеров. 70-85% точности. ~100 строк Python. Ноль зависимостей.</p>
+    </div>
+    <div class="comparison-card">
+      <h4>Фаза 2</h4>
+      <p>TalkNet-ASD (анализ губ + аудио). ~92% точности. Нужен GPU. Можно откладывать бесконечно.</p>
+    </div>
+  </div>
+
+  <p><strong>Видео-композитинг:</strong> CSS <code>transform: scale() translate()</code> в Remotion. GPU-ускоренная браузерная операция &mdash; бесплатная по производительности.</p>
+
+  <h4>Remotion-композиции</h4>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Композиция</th><th>Назначение</th><th>Фаза</th></tr></thead>
+      <tbody>
+        <tr><td><code>CaptionedVideo</code></td><td>Субтитры на нативном видео</td><td><span class="tag tag-green">Текущая</span></td></tr>
+        <tr><td><code>ShortsVideo</code></td><td>Кроп + субтитры в 9:16</td><td><span class="tag tag-blue">Фича 4</span></td></tr>
+        <tr><td><code>AutoEditVideo</code></td><td>Трекинг лица + монтаж + субтитры</td><td><span class="tag tag-accent">Фича 3</span></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Время обработки (30-мин 1080p видео)</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Шаг</th><th>v1 (CPU)</th><th>v2 (API-First)</th></tr></thead>
+      <tbody>
+        <tr><td>Транскрипция + диаризация Deepgram</td><td>&mdash;</td><td><strong>~30 сек</strong></td></tr>
+        <tr><td>Детекция лиц (MediaPipe, 3 FPS)</td><td>1-2 мин</td><td>1-2 мин</td></tr>
+        <tr><td><del>Диаризация (pyannote)</del></td><td><del>15-30 мин</del></td><td><strong>Включено в Deepgram</strong></td></tr>
+        <tr><td>Маппинг лицо-спикер</td><td>&lt; 1 сек</td><td>&lt; 1 сек</td></tr>
+        <tr><td>Remotion рендер</td><td>10-30 мин</td><td>10-30 мин</td></tr>
+        <tr style="background: #16162a;"><td><strong>Итого</strong></td><td><strong>35-80 мин</strong></td><td><strong>12-33 мин</strong></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-green">
+    <strong>Бутылочное горлышко в 15-30 минут на диаризацию &mdash; полностью ликвидировано.</strong>
+  </div>
+
+  <h3>Требования к памяти</h3>
+  <div class="stats">
+    <div class="stat">
+      <div class="stat-value red">8-16 ГБ</div>
+      <div class="stat-label">v1 &mdash; пиковое RAM</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value green">~400 МБ</div>
+      <div class="stat-label">v2 &mdash; пиковое RAM</div>
+    </div>
+  </div>
+
+  <div class="callout callout-accent">
+    С 16 гигабайт до 400 мегабайт. В сорок раз. Это не оптимизация &mdash; это другая категория задач.
+  </div>
+
+  <h3>Ключевые цифры</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Метрика</th><th>v1</th><th>v2</th></tr></thead>
+      <tbody>
+        <tr><td>Время диаризации</td><td>15-30 мин (CPU) / 1-2 мин (GPU)</td><td><strong>~30 сек</strong> (API)</td></tr>
+        <tr><td>Время детекции лиц</td><td>1-2 мин</td><td>1-2 мин</td></tr>
+        <tr><td>Общее время анализа</td><td>17-33 мин (CPU)</td><td><strong>~2 мин</strong></td></tr>
+        <tr><td>Полный пайплайн</td><td>35-80 мин (CPU)</td><td><strong>12-33 мин</strong></td></tr>
+        <tr><td>Пиковое RAM</td><td>8-16 ГБ</td><td><strong>~400 МБ</strong></td></tr>
+        <tr><td>Зависимости</td><td>~280 МБ</td><td><strong>~30 МБ</strong></td></tr>
+        <tr><td>GPU нужен?</td><td>Фаза 2 рекомендуется</td><td><strong>Нет</strong> (только для опционального TalkNet-ASD в Фазе 2)</td></tr>
+        <tr><td>Срок MVP</td><td>12-15 дней</td><td><strong>8-10 дней</strong></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Риски</h3>
+  <div class="risk"><strong>Маппинг лицо-спикер</strong> &mdash; 70-85% точности. Каждое пятое назначение может быть неверным. Пользователь должен поправить вручную.</div>
+  <div class="risk"><strong>Deepgram DER</strong> &mdash; ~12-15% vs ~10% у pyannote. Приемлемо.</div>
+  <div class="risk"><strong>Потеря качества при кропе</strong> &mdash; без изменений.</div>
+  <div class="risk"><strong>TalkNet-ASD отложен</strong> &mdash; если нужен GPU, разберёмся когда дойдём.</div>
+
+  <h3>MVP vs Полная версия</h3>
+  <div class="comparison">
+    <div class="comparison-card">
+      <h4>MVP &mdash; 8-10 дней</h4>
+      <p>Детекция лиц + Deepgram спикеры. Временная корреляция. Ручная коррекция. Статический кроп.</p>
+    </div>
+    <div class="comparison-card">
+      <h4>Полная &mdash; 20-30 дней</h4>
+      <p>Динамический кроп. Плавные переходы. Сплит-скрин. Мульти-спикер. Опциональный TalkNet-ASD.</p>
+    </div>
+  </div>
+</section>
+
+<!-- ═══════ FEATURE 4 ═══════ -->
+<section id="f4">
+  <div class="feature-header">
+    <div class="feature-num feature-num-4">4</div>
+    <div class="feature-title">Конвертация в Shorts (9:16)</div>
+  </div>
+
+  <div class="callout callout-green"><strong>Без изменений по сравнению с v1.</strong> Не зависит от ML.</div>
+
+  <p><strong>Пайплайн:</strong> Сначала кроп, потом субтитры &mdash; всегда. Один проход Remotion через <code>ShortsVideo</code>.</p>
+
+  <h4>Спецификация кропа</h4>
+  <pre><code>type CropConfig = {
+  mode: "static" | "keyframe";
+  staticCrop?: { x: number; y: number; zoom: number };
+  keyframes?: Array&lt;{ time: number; x: number; y: number; zoom: number }&gt;;
+  interpolation?: "linear" | "ease" | "smooth";
+};</code></pre>
+
+  <h3>Бэкенд</h3>
+  <p><strong>Новый джоб:</strong> <code>ASPECT_CONVERT</code>. Функция <code>crop_to_vertical()</code> в <code>media/service.py</code>.</p>
+  <p><strong>Новый артефакт:</strong> <code>VERTICAL_VIDEO</code> в <code>ArtifactTypeEnum</code>.</p>
+
+  <h3>Фронтенд</h3>
+  <ul>
+    <li>Перетаскиваемый прямоугольник 9:16 поверх видеоплеера</li>
+    <li>Side-by-side превью: оригинал vs обрезанное</li>
+    <li>Кнопка &laquo;Конвертировать в Short&raquo; на одобренных вирусных клипах</li>
+    <li>Автозаполнение кропа из данных детекции лица</li>
+  </ul>
+
+  <h3>Время обработки</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Подход</th><th>Время (30-мин видео)</th></tr></thead>
+      <tbody>
+        <tr><td>FFmpeg кроп (без субтитров)</td><td>12-36 мин</td></tr>
+        <tr><td>Remotion кроп + субтитры</td><td>11-45 мин</td></tr>
+        <tr><td>FFmpeg с NVENC</td><td>3-5 мин</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>MVP vs Полная версия</h3>
+  <div class="comparison">
+    <div class="comparison-card">
+      <h4>MVP &mdash; 6-8 дней</h4>
+      <p>Ручной выбор кропа с превью. Remotion-композиция <code>ShortsVideo</code>.</p>
+    </div>
+    <div class="comparison-card">
+      <h4>Полная &mdash; +3-4 дня после Фичи 3</h4>
+      <p>Авто-кроп по лицу. Один клик. Пакетный экспорт.</p>
+    </div>
+  </div>
+</section>
+
+<!-- ═══════ BUILD ORDER ═══════ -->
+<section id="order">
+  <h2>Рекомендуемый порядок разработки</h2>
+
+  <div class="gantt">
+    <div class="gantt-row">
+      <div class="gantt-label">Фича 1 &mdash; Шаблоны</div>
+      <div class="gantt-bar-wrap">
+        <div class="gantt-bar" style="left: 0%; width: 18%; background: linear-gradient(135deg, #34d399, #059669);">Нед 1-2</div>
+      </div>
+    </div>
+    <div class="gantt-row">
+      <div class="gantt-label">Фича 2 &mdash; Вирусная детекция</div>
+      <div class="gantt-bar-wrap">
+        <div class="gantt-bar" style="left: 14%; width: 14%; background: linear-gradient(135deg, #7c6aef, #5b4acf);">Нед 2-3</div>
+      </div>
+    </div>
+    <div class="gantt-row">
+      <div class="gantt-label">Фича 4 &mdash; 9:16 кроп</div>
+      <div class="gantt-bar-wrap">
+        <div class="gantt-bar" style="left: 23%; width: 22%; background: linear-gradient(135deg, #60a5fa, #3b82f6);">Нед 3-5</div>
+      </div>
+    </div>
+    <div class="gantt-row">
+      <div class="gantt-label">Фича 3 &mdash; Трекинг лица</div>
+      <div class="gantt-bar-wrap">
+        <div class="gantt-bar" style="left: 41%; width: 46%; background: linear-gradient(135deg, #f87171, #dc2626);">Нед 5-10</div>
+      </div>
+    </div>
+    <div class="gantt-row">
+      <div class="gantt-label">Фича 4 &mdash; апгрейд</div>
+      <div class="gantt-bar-wrap">
+        <div class="gantt-bar" style="left: 86%; width: 14%; background: linear-gradient(135deg, #60a5fa, #3b82f6);">Нед 10-11</div>
+      </div>
+    </div>
+  </div>
+
+  <h3>Почему именно так</h3>
+  <ol>
+    <li><strong>Шаблоны первыми</strong> &mdash; спецификация и план готовы, нулевой риск, моментальная польза</li>
+    <li><strong>Вирусная детекция второй</strong> &mdash; самый быстрый ROI (3-5 дней MVP), валидирует спрос</li>
+    <li><strong>9:16 MVP третьим</strong> &mdash; создаёт <code>ShortsVideo</code>, полезна сама по себе</li>
+    <li><strong>Трекинг лица последним</strong> &mdash; самая сложная, но теперь проще без pyannote/GPU</li>
+    <li><strong>Апгрейд 9:16</strong> &mdash; тривиален, когда трекинг даёт позиции</li>
+  </ol>
+</section>
+
+<!-- ═══════ COST ═══════ -->
+<section id="cost">
+  <h2>Анализ стоимости</h2>
+
+  <h3>Стоимость одного видео (30-мин, все фичи)</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Компонент</th><th>v1 (локальный ML)</th><th>v2 (API-First)</th></tr></thead>
+      <tbody>
+        <tr><td>Транскрипция + диаризация</td><td>$0.07 compute</td><td><strong>$0.16</strong> (Deepgram)</td></tr>
+        <tr><td>LLM вирусная детекция</td><td>$0.005 (Gemini)</td><td><strong>$0.01</strong> (GigaChat)</td></tr>
+        <tr><td>Детекция лиц</td><td>$0.002</td><td>$0.002</td></tr>
+        <tr><td>FFmpeg/Remotion рендер</td><td>$0.02</td><td>$0.02</td></tr>
+        <tr style="background: #16162a;"><td><strong>Итого</strong></td><td><strong>$0.11</strong></td><td><strong>$0.20</strong></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-accent">
+    Двадцать центов. За полный пайплайн: транскрипция, диаризация, поиск вирусных моментов, детекция лиц, рендер. <strong>Двадцать центов.</strong>
+  </div>
+
+  <h3>Сравнение месячных расходов</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Масштаб</th><th>v1</th><th>v2</th></tr></thead>
+      <tbody>
+        <tr><td>100 видео/мес</td><td>$11 + сервер + $0-380 GPU</td><td><strong>$20 API + сервер</strong></td></tr>
+        <tr><td>500 видео/мес</td><td>$255-435</td><td><strong>$100 + сервер</strong></td></tr>
+        <tr><td>1 000 видео/мес</td><td>$490</td><td><strong>$200 + сервер</strong></td></tr>
+        <tr><td>5 000 видео/мес</td><td>$930</td><td><strong>$1 000 + сервер</strong></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-yellow">
+    <strong>Точка безубыточности:</strong> ~2 000-3 000 видео/месяц. Ниже &mdash; API дешевле. А если учесть, что в v2 не нужно сопровождать GPU, ML-инфраструктуру и разбирать OOM-крэши, реальная точка безубыточности ещё выше.
+  </div>
+
+  <h3>Предлагаемые тарифы SaaS</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Тариф</th><th>Цена</th><th>Ограничения</th><th>Себестоимость (за видео)</th><th>Маржа</th></tr></thead>
+      <tbody>
+        <tr><td><strong>Free</strong></td><td>$0</td><td>Видео до 10 мин, 5/мес</td><td>~$0.07</td><td><span class="tag tag-blue">Маркетинг</span></td></tr>
+        <tr><td><strong>Pro</strong></td><td>$15-30/мес</td><td>До 30 мин, 50/мес</td><td>~$0.20</td><td><span class="tag tag-green">50-70%</span></td></tr>
+        <tr><td><strong>Business</strong></td><td>$50-100/мес</td><td>До 60 мин, 200/мес</td><td>~$0.35</td><td><span class="tag tag-green">65-80%</span></td></tr>
+      </tbody>
+    </table>
+  </div>
+</section>
+
+<!-- ═══════ INFRA ═══════ -->
+<section id="infra">
+  <h2>Инфраструктура (v2 &mdash; упрощённая)</h2>
+
+  <h3>Архитектура</h3>
+  <div class="arch-diagram">
+    <div class="arch-row">
+      <div class="arch-box arch-box-primary">Frontend</div>
+      <div class="arch-arrow">&rarr;</div>
+      <div class="arch-box arch-box-primary">Backend API</div>
+      <div class="arch-arrow">&rarr;</div>
+      <div class="arch-box arch-box-service">Dramatiq-воркер (MediaPipe)</div>
+    </div>
+    <div class="arch-arrow-down">&darr;</div>
+    <div class="arch-row">
+      <div class="arch-box arch-box-storage">PostgreSQL</div>
+      <div class="arch-box arch-box-storage">Redis</div>
+      <div class="arch-box arch-box-storage">S3/MinIO</div>
+      <div class="arch-box arch-box-service">Remotion</div>
+    </div>
+    <div class="arch-arrow-down">&darr;</div>
+    <div class="arch-row">
+      <div class="arch-box arch-box-api">Deepgram API</div>
+      <div class="arch-box arch-box-api">GigaChat API</div>
+      <div class="arch-box arch-box-api">DeepInfra</div>
+    </div>
+    <p style="margin-top: 1rem; color: var(--text-muted); font-size: .8rem;">Нет ML-воркера. Нет GPU. Нет Docker Compose profiles. Один воркер обрабатывает всё.</p>
+  </div>
+
+  <h3>Docker-образ</h3>
+  <div class="stats">
+    <div class="stat">
+      <div class="stat-value red">1.72 ГБ</div>
+      <div class="stat-label">v1: python + PyTorch + Whisper + CUDA</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value green">400-500 МБ</div>
+      <div class="stat-label">v2: python + mediapipe</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value red">16 ГБ</div>
+      <div class="stat-label">v1: рекомендованная RAM</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value green">4 ГБ</div>
+      <div class="stat-label">v2: достаточная RAM</div>
+    </div>
+  </div>
+
+  <h3>Разделение ML-сервиса не требуется</h3>
+  <p>MediaPipe (~30 МБ на диске, ~400 МБ RAM) &mdash; единственный ML-компонент, который работает локально. Не нужны:</p>
+  <ul>
+    <li>Отдельный контейнер ML-воркера</li>
+    <li>Docker Compose profiles для ML</li>
+    <li>GPU-инфраструктура</li>
+    <li>Выделенные очереди Dramatiq для ML</li>
+  </ul>
+  <p>Стандартный воркер с <code>--processes 1 --threads 2</code> справляется со всем.</p>
+
+  <h4>Новые настройки</h4>
+  <pre><code># Deepgram
+deepgram_api_key: str = Field(default="", alias="DEEPGRAM_API_KEY")
+
+# GigaChat (Сбер)
+gigachat_client_id: str = Field(default="", alias="GIGACHAT_CLIENT_ID")
+gigachat_client_secret: str = Field(default="", alias="GIGACHAT_CLIENT_SECRET")
+
+# DeepInfra (фоллбэк LLM)
+deepinfra_api_key: str = Field(default="", alias="DEEPINFRA_API_KEY")
+
+# Конфигурация LLM
+llm_provider: str = Field(default="gigachat", alias="LLM_PROVIDER")
+llm_viral_prompt_version: str = Field(default="v1", alias="LLM_VIRAL_PROMPT_VERSION")</code></pre>
+</section>
+
+<!-- ═══════ STACK ═══════ -->
+<section id="stack">
+  <h2>Сводка по технологическому стеку</h2>
+
+  <h3>Новые зависимости (v2)</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Пакет</th><th>Размер</th><th>Назначение</th><th>Фича</th></tr></thead>
+      <tbody>
+        <tr><td><code>mediapipe</code></td><td>~30 МБ</td><td>Детекция лиц (CPU)</td><td>3</td></tr>
+        <tr><td><code>httpx</code></td><td>Уже установлен</td><td>API-вызовы</td><td>2, 3</td></tr>
+        <tr style="background: #16162a;"><td><strong>Итого</strong></td><td><strong>~30 МБ</strong></td><td></td><td></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Удалённые зависимости</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Пакет</th><th>Сэкономлено</th><th>Заменён на</th></tr></thead>
+      <tbody>
+        <tr><td><del><code>openai-whisper</code></del></td><td>~50 МБ + PyTorch ~2 ГБ</td><td>Deepgram</td></tr>
+        <tr><td><del><code>pyannote-audio</code></del></td><td>~200 МБ</td><td>Deepgram</td></tr>
+        <tr><td><del><code>torchaudio</code></del></td><td>~50-80 МБ</td><td>Deepgram</td></tr>
+        <tr><td><del><code>librosa</code></del></td><td>~20 МБ</td><td>Deepgram sentiment</td></tr>
+        <tr style="background: #16162a;"><td><strong>Итого удалено</strong></td><td><strong>~2.3 ГБ</strong></td><td></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-green">
+    Добавили 30 мегабайт. Удалили 2.3 гигабайта. <strong>Соотношение 1:77.</strong>
+  </div>
+
+  <h3>Новые компоненты</h3>
+  <div class="stats">
+    <div class="stat">
+      <div class="stat-value accent">clips</div>
+      <div class="stat-label">Бэкенд-модуль: CRUD + ревью клипов</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value blue">ShortsVideo</div>
+      <div class="stat-label">Remotion: кроп + субтитры 9:16</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value cyan">AutoEditVideo</div>
+      <div class="stat-label">Remotion: трекинг лица + субтитры</div>
+    </div>
+  </div>
+
+  <h4>Новые типы джобов</h4>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Тип</th><th>Назначение</th><th>Фича</th></tr></thead>
+      <tbody>
+        <tr><td><code>VIRAL_DETECT</code></td><td>GigaChat анализ транскрипции</td><td>2</td></tr>
+        <tr><td><code>ASPECT_CONVERT</code></td><td>9:16 кроп + пере-кодирование</td><td>4</td></tr>
+        <tr><td><code>FACE_DETECT</code></td><td>Детекция bounding box (MediaPipe)</td><td>3</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <div class="callout callout-yellow">
+    <code>SPEAKER_DIARIZE</code> <strong>больше не является отдельным типом джоба</strong> &mdash; диаризация включена в транскрипцию Deepgram.
+  </div>
+
+  <h4>Расширение движков транскрипции</h4>
+  <pre><code>engine: Literal["whisper", "google", "deepgram"] = "deepgram"</code></pre>
+  <p>Deepgram &mdash; дефолт. Whisper &mdash; опциональный фоллбэк (<code>uv sync --group whisper</code>).</p>
+</section>
+
+<!-- ═══════ ISSUES ═══════ -->
+<section id="issues">
+  <h2>Сквозные проблемы (v2)</h2>
+
+  <h3>Остались из v1</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Проблема</th><th>Приоритет</th><th>Действие</th></tr></thead>
+      <tbody>
+        <tr><td><code>_get_job_status_sync()</code> допускает утечку соединений с БД</td><td><span class="tag tag-red">Высокий</span></td><td>Починить до новых акторов</td></tr>
+        <tr><td><code>tasks/service.py</code> &mdash; 1 674 строки</td><td><span class="tag tag-yellow">Средний</span></td><td>Вынести бойлерплейт акторов</td></tr>
+        <tr><td><code>REMOTION_SERVICE_URL</code> default неверный</td><td><span class="tag tag-yellow">Средний</span></td><td>Исправить на <code>http://remotion:3001</code></td></tr>
+        <tr><td>Нет лимитов ресурсов на Docker-сервисах</td><td><span class="tag tag-yellow">Средний</span></td><td>Добавить memory/CPU лимиты</td></tr>
+        <tr><td>Нет очистки /tmp при OOM</td><td><span class="tag tag-yellow">Средний</span></td><td>Периодическая очистка / cron</td></tr>
+        <tr><td>Проверка <code>isCurrent</code> в <code>Captions.tsx</code> хрупкая</td><td><span class="tag tag-blue">Низкий</span></td><td>Сравнивать по индексу</td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Новые в v2</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Проблема</th><th>Приоритет</th><th>Действие</th></tr></thead>
+      <tbody>
+        <tr><td>Управление API-ключами (3 сервиса)</td><td><span class="tag tag-red">Высокий</span></td><td>Всё через env-переменные, никогда в коде</td></tr>
+        <tr><td>Rate limit'ы API</td><td><span class="tag tag-red">Высокий</span></td><td>Retry + exponential backoff</td></tr>
+        <tr><td>Vendor lock-in</td><td><span class="tag tag-yellow">Средний</span></td><td>Интерфейс движков (существующий паттерн)</td></tr>
+        <tr><td>Зависимость от сети</td><td><span class="tag tag-yellow">Средний</span></td><td>Whisper как фоллбэк</td></tr>
+        <tr><td>Deepgram &rarr; Document конвертация</td><td><span class="tag tag-yellow">Средний</span></td><td>Конвертер под существующую схему</td></tr>
+        <tr><td>GigaChat OAuth2 token refresh</td><td><span class="tag tag-yellow">Средний</span></td><td>Кэш + авто-обновление в <code>infrastructure/</code></td></tr>
+      </tbody>
+    </table>
+  </div>
+
+  <h3>Ликвидированные из v1</h3>
+  <div class="table-wrap">
+    <table>
+      <thead><tr><th>Проблема</th><th>Почему исчезла</th></tr></thead>
+      <tbody>
+        <tr><td><del>PyTorch CPU-only индекс</del></td><td>PyTorch удалён полностью</td></tr>
+        <tr><td><del>OOM воркера на ML-джобах</del></td><td>Нет тяжёлого ML локально</td></tr>
+        <tr><td><del>Docker-образ ML-воркера</del></td><td>Один лёгкий образ</td></tr>
+        <tr><td><del>GPU-инфраструктура</del></td><td>Весь ML через API</td></tr>
+        <tr><td><del>Конфликты версий PyTorch</del></td><td>Нет PyTorch</td></tr>
+        <tr><td><del>Скачивание моделей при первом запуске</del></td><td>Нет локальных моделей</td></tr>
+      </tbody>
+    </table>
+  </div>
+</section>
+
+</div><!-- /.container -->
+
+<footer>
+  <p>Coffee Project &middot; Техническая консультация v2 &middot; 22 марта 2026</p>
+  <p style="margin-top: .5rem;">Отчёты специалистов доступны в стенограмме сессии</p>
+</footer>
+
+</body>
+</html>
diff --git a/docs/consults/video-features-roadmap_v2_ru.md b/docs/consults/video-features-roadmap_v2_ru.md
new file mode 100644
index 0000000..e8c8038
--- /dev/null
+++ b/docs/consults/video-features-roadmap_v2_ru.md
@@ -0,0 +1,537 @@
+# Дорожная карта видеофич — Техническая консультация v2 (API-First)
+
+**Дата:** 22 марта 2026
+**Консультанты:** ML/AI-инженер, Backend-архитектор, Remotion-инженер, Frontend-архитектор, DevOps-инженер, Инженер по производительности
+**Ревизия:** v2 — переход на API-first архитектуру: Deepgram, GigaChat, DeepInfra
+
+---
+
+## Что изменилось по сравнению с v1
+
+Одно принципиальное решение перевернуло всю архитектуру: вместо локальных ML-моделей — управляемые API-сервисы. PyTorch, GPU-инфраструктура, разделение ML-воркеров, большинство проблем с памятью и временем обработки — всё это просто исчезло.
+
+### Замены API
+
+| v1 (локальный ML) | v2 (API-First) | Эффект |
+|---|---|---|
+| Локальный Whisper (PyTorch, 20-60 мин на CPU) | **Deepgram Nova-3** API (~30 сек) | PyTorch больше не нужен вообще |
+| Локальный pyannote.audio (PyTorch, 15-30 мин CPU) | **Deepgram** `diarize=true` (включено в вызов транскрипции) | pyannote + torchaudio — удалены |
+| Gemini 2.5 Flash / GPT-4o-mini для вирусной детекции | **GigaChat Pro** (нативная русскоязычная LLM от Сбера) | Лучше понимает русский юмор, культурный контекст, сленг |
+| librosa для анализа энергии аудио | **Deepgram** `sentiment=true` (посегментно) | Сентимент заменяет анализ энергии в большинстве сценариев |
+| Нет | **DeepInfra** (Llama, Mistral, Qwen через API) | Фоллбэк / A/B-тестирование LLM-анализа |
+
+### Как изменились ключевые метрики
+
+Цифры говорят сами за себя:
+
+| Метрика | v1 | v2 | Изменение |
+|---|---|---|---|
+| Размер Docker-образа | 1.72 ГБ | **~400-500 МБ** | -75% (нет PyTorch) |
+| Пиковое потребление RAM воркера | 8-16 ГБ | **~400 МБ** (только MediaPipe) | -95% |
+| Время обработки (30-мин видео, полный пайплайн) | 35-80 мин (CPU) | **12-33 мин** | ~-60% (остаётся рендер Remotion) |
+| Стоимость за видео | $0.11 | **$0.20** | +80% (стоимость API) |
+| Месячная стоимость (100 видео) | $11 вычисления + сервер + $0-380 GPU | **$20 API + сервер** | Проще, дешевле при малых объёмах |
+| Нужен GPU? | Фаза 2 для диаризации | **Нет** (разве что для опционального TalkNet-ASD) | Исключён из базового пайплайна |
+| Новые Python-зависимости | ~310-340 МБ | **~30 МБ** (только mediapipe; `httpx` уже установлен) | -90% |
+| Общий срок всех MVP | 26-34 dev-дня | **20-27 dev-дней** | -20-25% |
+
+Образ в четыре раза легче. RAM в сорок раз меньше. Обработка в два-три раза быстрее (анализ — за пару минут, остальное время занимает рендер). Цена за видео почти вдвое выше, но расходы на GPU и ML-инфраструктуру сжимаются до нуля. Неплохой обмен.
+
+### Проблемы, которых больше нет
+
+Эти сквозные проблемы из v1 перестали существовать:
+
+| Проблема из v1 | Почему исчезла |
+|---|---|
+| ~~Переключить PyTorch на CPU-only индекс~~ | PyTorch удалён полностью (Whisper заменён Deepgram) |
+| ~~OOM воркера на параллельных ML-джобах~~ | Нет тяжёлого ML — стандартный воркер на 4 ГБ |
+| ~~Отдельный Docker-образ для ML-воркера~~ | Один лёгкий образ |
+| ~~Планирование GPU-инфраструктуры~~ | Весь ML — через API |
+| ~~Конфликты версий PyTorch~~ | Нет PyTorch |
+| ~~Скачивание моделей при первом запуске~~ | Нет локальных моделей (кроме MediaPipe, ~2 МБ) |
+| ~~Разделение ML через Docker Compose profiles~~ | Не нужно |
+
+### Новые проблемы
+
+Ничего бесплатного не бывает. Вот что появилось взамен:
+
+| Проблема | Приоритет | Митигация |
+|---|---|---|
+| Управление API-ключами (Deepgram, GigaChat, DeepInfra) | Высокий | Хранить в settings через env-переменные, никогда в коде |
+| Rate limit'ы API | Высокий | Retry с exponential backoff в акторах |
+| Зависимость от вендора (vendor lock-in) | Средний | Абстрагировать за интерфейсами движков (как уже сделано: `engine: "whisper" \| "google"`) |
+| Зависимость от сети (API упал = обработка встала) | Средний | Оставить Whisper как опциональный фоллбэк-движок |
+| Более высокая стоимость за видео ($0.20 vs $0.11) | Низкий | Компенсируется нулевыми инфраструктурными расходами; прибыльно на любом SaaS-тарифе |
+
+---
+
+## Общая картина
+
+| # | Фича | Сложность | MVP | Полная версия | Доп. инфраструктура |
+|---|-------|-----------|-----|---------------|---------------------|
+| 1 | Продвинутые шаблоны Remotion | Легко-средне | 3-4 дня | 3-4 дня | Ничего — можно начинать хоть сейчас |
+| 2 | Детекция вирусных моментов | Средне | **3-5 дней** | 6-10 дней | API-ключи (GigaChat, Deepgram) |
+| 3 | Авто-монтаж и трекинг лица | Сложно | **8-10 дней** | 20-30 дней | Только MediaPipe (CPU, ~30 МБ) |
+| 4 | Конвертация в вертикальные Shorts (9:16) | Средне | 6-8 дней | +3-4 дня после #3 | Ничего |
+| **Итого** | | | **20-27 дней** | **35-47 дней** | |
+
+Реалистичный прогноз для одного разработчика: **5-7 недель** (все MVP) или **2-3 месяца** (полные версии).
+
+---
+
+## Фича 1: Продвинутые шаблоны Remotion
+
+**Без изменений по сравнению с v1.** Эта фича не зависит от ML.
+
+**Статус:** Спецификация и план реализации уже написаны. Бери и делай.
+
+- Спецификация: `docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md`
+- План: `docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md`
+
+**Что делаем:** Расширяем `CaptionStyleSchema` четырьмя новыми стилями подсветки (pop_in, karaoke, bounce, glow_pulse), двумя переходами (zoom_in, drop_in), тремя полями (word_entrance, highlight_rotation_deg, text_transform). Два системных пресета: "Shorts" и "Podcast".
+
+**Где трогаем код:** Расширение схемы в Remotion + бэкенде, логика рендеринга в `Captions.tsx`, Alembic-миграция для пресетов, контролы в StyleEditor на фронте.
+
+---
+
+## Фича 2: Детекция вирусных моментов
+
+### Архитектура (v2 — API-First)
+
+В v1 мы планировали гонять LLM по тексту и считать энергию аудио через librosa. В v2 подход элегантнее: один вызов Deepgram даёт транскрипцию, разметку спикеров и сентимент-анализ — три результата за одну цену. А текст анализирует GigaChat, которому русский язык родной.
+
+**Транскрипция:** Deepgram Nova-3 API с `diarize=true` + `sentiment=true`. Один API-вызов возвращает пословные таймстемпы, метки спикеров и оценку сентимента по каждому высказыванию. Стоимость: $0.0053/мин ($0.16 за 30-минутное видео). Обработка: ~30 секунд.
+
+**LLM-анализ:** GigaChat Pro (от Сбера) — нативная русскоязычная LLM, обученная на русском интернете. Лучше ловит русский юмор, культурные отсылки, сленг и вирусные паттерны, чем модели, для которых русский — второй язык. Фоллбэк: DeepInfra (Llama 3.1 70B или Qwen) для A/B-тестирования.
+
+**Аудио-подкрепление:** Посегментные оценки сентимента от Deepgram заменяют анализ энергии через `librosa` в большинстве случаев. Ярко выраженный сентимент (сильно положительный или сильно отрицательный) коррелирует с вирусными моментами. Опционально: librosa для анализа громкости (смех, повышенный голос) как дополнение.
+
+**Пайплайн:**
+1. Транскрипция Deepgram с `diarize=true` + `sentiment=true` -> таймстемпы + спикеры + сентимент
+2. Конвертация ответа Deepgram в существующую схему `Document` (segments, lines, words)
+3. GigaChat анализирует текст транскрипции + данные сентимента -> кандидаты в вирусные клипы
+4. Постобработка: привязка границ к краям сегментов, вычисление композитных скоров
+5. Сохранение клипов в таблицу `clips`
+
+### Бэкенд
+
+**Новый модуль:** `clips` (models, schemas, repository, service, router) — хранит найденные клипы со связями project/file/job.
+
+**Модель клипа:**
+```
+Clip {
+  project_id: UUID (FK projects)
+  source_file_id: UUID (FK files)
+  job_id: UUID? (FK jobs)
+  title: str
+  start_ms: int
+  end_ms: int
+  score: float
+  source_type: "viral_detected" | "user_created" | "auto_generated"
+  status: "pending" | "approved" | "rejected" | "exported"
+  meta: JSON? (рассуждения LLM, теги, хэштеги, данные сентимента)
+}
+```
+
+**Новый тип джоба:** `VIRAL_DETECT` в `JobTypeEnum`. Актор вызывает GigaChat API через `httpx` из Dramatiq-воркера.
+
+**Расширение движков транскрипции:** Добавляем `"deepgram"` к существующему выбору (`engine: "whisper" | "google" | "deepgram"`). Deepgram становится дефолтом для новых транскрипций. Whisper остаётся как фоллбэк.
+
+**Интеграция с LLM:**
+- GigaChat API через `httpx` (OAuth2 token auth через Sber ID)
+- DeepInfra как фоллбэк (OpenAI-совместимый API)
+- Промпты хранятся в `cpv3/infrastructure/prompts/viral_detection_v1.txt`
+- Активная версия контролируется через `LLM_VIRAL_PROMPT_VERSION`
+- Новые настройки: `GIGACHAT_CLIENT_ID`, `GIGACHAT_CLIENT_SECRET`, `DEEPINFRA_API_KEY`, `DEEPGRAM_API_KEY`
+
+### Фронтенд
+
+- Новый `ViralClipsStep` в визарде проекта (features/project/)
+- Список клипов с превьюшками, скорами, заголовками, кнопками принять/отклонить
+- Модалка редактирования клипа с видео-превью (воспроизведение в диапазоне start/end)
+- Новый тип джоба `VIRAL_DETECT` в обработке нотификаций (через существующую WebSocket-инфраструктуру)
+
+### Ключевые цифры
+
+| Метрика | v1 | v2 |
+|---|---|---|
+| Время транскрипции | Зависит от Whisper (уже готово) | ~30 сек (Deepgram, если ещё не транскрибировано) |
+| Время LLM-анализа | 10-20 сек | 10-20 сек (без изменений) |
+| Общее время обработки | 10-20 сек (после транскрипции) | **40-50 сек** (включая транскрипцию Deepgram) |
+| Стоимость за видео | ~$0.005 (только LLM) | **~$0.17** ($0.16 Deepgram + $0.01 GigaChat) |
+| Точность (precision) | 50-70% | **60-80%** (GigaChat лучше на русском + данные сентимента) |
+| Новые зависимости | `google-generativeai` + `librosa` (~30 МБ) | **Только HTTP-клиент** (~0 МБ нового) |
+| Срок MVP | 5-7 дней | **3-5 дней** |
+
+Стоимость за видео выросла с полцента до семнадцати центов — но зато ноль новых зависимостей, ноль локальных моделей и на два дня быстрее в разработке.
+
+### Риски
+
+- **Доступность GigaChat API** — у Сбера uptime может быть ниже, чем у Google/OpenAI. Митигация: фоллбэк на DeepInfra.
+- **Structured output GigaChat** — проверить, что JSON mode / function calling работает стабильно для извлечения клипов. Тестировать рано.
+- **WER Deepgram на русском** — ~10-12% (Nova-3). Сопоставимо с Whisper `medium`. Достаточно для вирусной детекции.
+- **Визуальные моменты** по-прежнему не ловятся (~20-30%) — то же ограничение, что и в v1.
+
+### MVP vs Полная версия
+
+- **MVP (3-5 дней):** Транскрипция Deepgram + анализ GigaChat. Возвращает клипы со скорами. Пользователь ревьюит и принимает/отклоняет. Без анализа энергии аудио.
+- **Полная (6-10 дней):** Добавляем скоринг с весами сентимента, few-shot тюнинг промпта по фидбеку пользователей, пакетную обработку, прямой экспорт клипов в 9:16, A/B-тестирование через DeepInfra.
+
+---
+
+## Фича 3: Авто-монтаж и трекинг лица
+
+В v1 эта фича была монстром: pyannote на CPU 30 минут жуёт аудио, PyTorch конфликтует с Whisper, GPU-воркеры, 16 ГБ RAM. В v2 всё, что касалось диаризации, ушло в один API-вызов Deepgram. Осталась только детекция лиц через MediaPipe — лёгкая библиотека, работающая на CPU за минуту-две.
+
+### Архитектура (v2 — API-First)
+
+**Детекция лиц:** MediaPipe BlazeFace (без изменений). Apache 2.0, модель ~2МБ, 30-60 FPS на CPU. Сэмплируем на 3 FPS. **Это единственный оставшийся локальный ML-компонент.** Зависимость: `mediapipe` (~30МБ).
+
+**Диаризация спикеров:** **Deepgram API** с `diarize=true` (~30 секунд на 30-минутное видео). Полностью заменяет pyannote.audio. Диаризация включена в вызов транскрипции — никаких дополнительных расходов на API.
+
+**Маппинг лицо-спикер:**
+- **Фаза 1:** Эвристика по временнОй корреляции — сопоставляем треки лиц с сегментами спикеров Deepgram по максимальному пересечению во времени. 70-85% точности для видео с двумя спикерами. Ноль дополнительных зависимостей. ~100 строк Python.
+- **Фаза 2:** TalkNet-ASD — если понадобится повысить точность. Единственный сценарий, где GPU мог бы вернуться в повестку, но можно откладывать бесконечно, если временная корреляция + ручная коррекция пользователем достаточны.
+
+**Видео-композитинг:** Как в v1 — Remotion-композиции с CSS transform crop. Без изменений.
+
+**Новые Remotion-композиции:** Как в v1.
+
+| Композиция | Назначение | Фаза |
+|---|---|---|
+| `CaptionedVideo` (существует) | Наложение субтитров на нативное видео | Текущая |
+| `ShortsVideo` (новая) | Статический/ключевой кроп + субтитры в 9:16 | Фича 4 |
+| `AutoEditVideo` (новая) | Кроп с трекингом лица + монтаж + субтитры | Фича 3 (полная) |
+
+**Формат данных кропа:** Как в v1 (ключевые кадры с нормализованными координатами 0-1).
+
+### Бэкенд
+
+**Новый тип джоба:** `FACE_DETECT` в `JobTypeEnum`. `SPEAKER_DIARIZE` **больше не нужен как отдельный джоб** — диаризация приходит от Deepgram в составе транскрипции.
+
+**Отделение ML-сервиса:** **Не требуется.** MediaPipe лёгкая (~30МБ, ~400МБ RAM). Работает в стандартном Dramatiq-воркере.
+
+**Изменения в Remotion-сервисе:** Как в v1 — параметр `compositionId`, пропсы `crop`/`outputWidth`/`outputHeight`.
+
+### Время обработки (30-минутное 1080p видео)
+
+Вот где разница между v1 и v2 ощущается физически:
+
+| Шаг | v1 (CPU) | v2 (API-First) |
+|---|---|---|
+| Транскрипция + диаризация Deepgram | Нет | **~30 сек** |
+| Детекция лиц (MediaPipe, 3 FPS) | 1-2 мин | 1-2 мин (без изменений) |
+| ~~Диаризация спикеров (pyannote)~~ | ~~15-30 мин~~ | **Включено в Deepgram** |
+| Маппинг лицо-спикер | < 1 сек | < 1 сек |
+| Рендер Remotion (кроп + субтитры) | 10-30 мин | 10-30 мин (без изменений) |
+| **Итого (с параллелизацией)** | **35-80 мин** | **12-33 мин** |
+
+**Бутылочное горлышко в 15-30 минут на диаризацию — полностью ликвидировано.**
+
+### Требования к памяти
+
+| Конфигурация | v1 | v2 |
+|---|---|---|
+| Пиковое потребление RAM | 8-16 ГБ | **~400 МБ** (только MediaPipe) |
+| Конфигурация воркера | `--threads 1`, лимит 16 ГБ | Стандартный воркер, лимит 4 ГБ |
+
+С 16 гигабайт до 400 мегабайт. В сорок раз. Это не оптимизация — это другая категория задач.
+
+### Фронтенд
+
+Без изменений по сравнению с v1:
+- Превью трекинга лица: видеоплеер с наложением bounding box через canvas
+- Трек спикеров в TimelinePanel
+- Контролы: слайдер зума, скорость перехода, выбор спикера
+- Переключатель сравнения «до/после»
+
+### Ключевые цифры
+
+| Метрика | v1 | v2 |
+|---|---|---|
+| Время диаризации | 15-30 мин (CPU) / 1-2 мин (GPU) | **~30 сек** (API) |
+| Время детекции лиц | 1-2 мин | 1-2 мин (без изменений) |
+| Общее время анализа | 17-33 мин (CPU) | **~2 мин** |
+| Полный пайплайн (с рендером) | 35-80 мин (CPU) | **12-33 мин** |
+| Пиковое потребление RAM | 8-16 ГБ | **~400 МБ** |
+| Новые зависимости | ~280 МБ (mediapipe + pyannote + torchaudio) | **~30 МБ** (только mediapipe) |
+| GPU нужен? | Фаза 2 — рекомендуется | **Нет** (только если в Фазе 2 понадобится опциональный TalkNet-ASD) |
+| Срок MVP | 12-15 дней | **8-10 дней** |
+
+### Риски
+
+- **Точность маппинга лицо-спикер** не изменилась (70-85% с эвристикой) — по-прежнему самая сложная подзадача
+- **Точность диаризации Deepgram** — DER может быть чуть хуже, чем у pyannote 3.1 (~12-15% vs ~10%). Приемлемо для нашего кейса.
+- **Потеря качества при кропе** — без изменений
+- **TalkNet-ASD отложен** — если временная корреляция недостаточно точна, TalkNet потребует GPU. Разберёмся, когда дойдём.
+
+### MVP vs Полная версия
+
+- **MVP (8-10 дней):** Детекция лиц на сэмплированных кадрах. Deepgram предоставляет метки спикеров. Временная корреляция привязывает лица к спикерам. Пользователь может поправить вручную. Статический кроп на выбранное лицо.
+- **Полная (20-30 дней):** Динамический кроп, следующий за активным спикером. Плавные переходы. Сплит-скрин. Мульти-спикер. Опциональный TalkNet-ASD для повышения точности.
+
+---
+
+## Фича 4: Конвертация в вертикальные Shorts (9:16)
+
+**Без изменений по сравнению с v1.** Эта фича не зависит от ML.
+
+### Архитектура
+
+**Пайплайн:** Сначала кроп, потом субтитры — всегда. Один проход рендеринга в Remotion через новую композицию `ShortsVideo`.
+
+**Позиционирование субтитров:** Новые поля в схеме не нужны. Бэкенд корректирует `font_size`, `padding_px`, `max_width_pct` в `styleConfig` под 9:16.
+
+**Спецификация кропа:**
+```typescript
+type CropConfig = {
+  mode: "static" | "keyframe";
+  staticCrop?: { x: number; y: number; zoom: number };
+  keyframes?: Array<{ time: number; x: number; y: number; zoom: number }>;
+  interpolation?: "linear" | "ease" | "smooth";
+};
+```
+
+### Бэкенд
+
+**Новый тип джоба:** `ASPECT_CONVERT` в `JobTypeEnum`. Новая функция `crop_to_vertical()` в `media/service.py`.
+
+**Новый тип артефакта:** `VERTICAL_VIDEO` в `ArtifactTypeEnum`.
+
+### Фронтенд
+
+- Превью кропа: перетаскиваемый прямоугольник 9:16 поверх видеоплеера
+- Переключатель side-by-side превью
+- Кнопка «Конвертировать в Short» на каждом одобренном вирусном клипе
+- Автозаполнение кропа из данных детекции лица (когда доступны)
+
+### Время обработки
+
+| Подход | Время (30-мин видео) |
+|---|---|
+| FFmpeg кроп (без субтитров) | 12-36 мин |
+| Remotion кроп + субтитры (один проход) | 11-45 мин |
+| FFmpeg с NVENC (аппаратное кодирование) | 3-5 мин |
+
+### MVP vs Полная версия
+
+- **MVP (6-8 дней):** Ручной выбор региона кропа с превью. Remotion-композиция `ShortsVideo`.
+- **Полная (+3-4 дня после Фичи 3):** Авто-кроп из данных детекции лица. Конвертация в один клик. Пакетный экспорт.
+
+---
+
+## Рекомендуемый порядок разработки
+
+```
+Неделя 1-2:    Фича 1 (Шаблоны)              ████████
+Неделя 2-3:    Фича 2 (Вирусная детекция)     ██████████
+Неделя 3-5:    Фича 4 MVP (9:16 кроп)         ████████████████
+Неделя 5-10:   Фича 3 (Трекинг лица)          ██████████████████████████████
+Неделя 10-11:  Фича 4 (апгрейд)               ████████
+```
+
+**Почему именно так:**
+1. **Шаблоны первыми** — готовы к реализации, нулевой риск, моментальная польза для пользователей
+2. **Вирусная детекция второй** — самый быстрый ROI с API-first подходом (3-5 дней MVP), валидирует спрос
+3. **9:16 MVP третьим** — создаёт композицию `ShortsVideo`, полезна сама по себе
+4. **Трекинг лица последним** — всё ещё самая сложная фича, но теперь намного проще без pyannote/GPU
+5. **Апгрейд 9:16** — тривиален, когда трекинг лица даёт данные о позициях
+
+---
+
+## Анализ стоимости
+
+### Стоимость обработки одного видео (30-мин, все фичи)
+
+| Компонент | v1 (локальный ML) | v2 (API-First) |
+|---|---|---|
+| Транскрипция + диаризация | $0.07 вычисления | **$0.16** (Deepgram) |
+| LLM вирусная детекция | $0.005 (Gemini) | **$0.01** (GigaChat) |
+| Детекция лиц | $0.002 вычисления | $0.002 вычисления (без изменений) |
+| FFmpeg/Remotion рендер | $0.02 вычисления | $0.02 вычисления |
+| **Итого за видео** | **$0.11** | **$0.20** |
+
+Двадцать центов. За полный пайплайн: транскрипция, диаризация, поиск вирусных моментов, детекция лиц, рендер. Двадцать центов.
+
+### Сравнение месячных расходов
+
+| Масштаб | v1 (локальный ML) | v2 (API-First) |
+|---|---|---|
+| 100 видео/месяц | $11 вычисления + сервер + $0-380 GPU | **$20 API + сервер** |
+| 500 видео/месяц | $55 + $200-380 GPU = $255-435 | **$100 API + сервер** |
+| 1 000 видео/месяц | $110 + $380 GPU = $490 | **$200 API + сервер** |
+| 5 000 видео/месяц | $550 + $380 GPU = $930 | **$1 000 API + сервер** |
+
+**Точка безубыточности:** ~2 000-4 000 видео/месяц — в зависимости от стоимости GPU ($200-380 в месяц, делённые на разницу в ~$0.09 за видео). Ниже этого порога API дешевле. А учитывая, что не нужно думать о GPU, обслуживать ML-инфраструктуру и чинить OOM-крэши — реальный breakeven ещё выше.
+
+### Предлагаемые тарифы SaaS
+
+| Тариф | Цена | Ограничения | Себестоимость | Маржа |
+|---|---|---|---|---|
+| Free | $0 | Видео до 10 мин, 5/месяц | ~$0.07/видео | Маркетинг |
+| Pro | $15-30/мес | Видео до 30 мин, 50/месяц | ~$0.20/видео | 50-70% |
+| Business | $50-100/мес | Видео до 60 мин, 200/месяц | ~$0.35/видео | 65-80% |
+
+---
+
+## Инфраструктура (v2 — упрощённая)
+
+### Архитектура
+
+```
+Фронтенд → Backend API → Dramatiq-воркер (лёгкий: только MediaPipe)
+                              ↕              ↕           ↕
+                         PostgreSQL     Deepgram API   GigaChat API
+                         Redis          (транскрипция   (вирусная
+                         S3/MinIO        + диаризация)   детекция)
+                         Remotion        DeepInfra
+                                         (фоллбэк LLM)
+```
+
+Сравните с v1: нет ML-воркера, нет GPU, нет Docker Compose profiles. Один воркер обрабатывает всё.
+
+### Docker-образ
+
+| | v1 | v2 |
+|---|---|---|
+| База | python:3.11-slim + PyTorch + Whisper + CUDA libs | python:3.11-slim + mediapipe |
+| Размер | 1.72 ГБ | **~400-500 МБ** |
+| RAM | 16 ГБ рекомендовано | **4 ГБ достаточно** |
+
+**Можно убрать из основных зависимостей в `pyproject.toml`:** `openai-whisper` (и транзитивно PyTorch) — как только Deepgram становится движком по умолчанию. Whisper при этом переезжает в опциональную группу зависимостей (`uv sync --group whisper`) и остаётся доступным как фоллбэк.
+
+### Разделение ML-сервиса не требуется
+
+При том, что локально работает только MediaPipe (~30МБ, ~400МБ RAM), не нужны:
+- Отдельный контейнер ML-воркера
+- Docker Compose profiles для ML
+- GPU-инфраструктура
+- Выделенные очереди Dramatiq для ML
+
+Стандартный воркер с `--processes 1 --threads 2` справляется со всем.
+
+### Новые настройки
+
+```python
+# Deepgram
+deepgram_api_key: str = Field(default="", alias="DEEPGRAM_API_KEY")
+
+# GigaChat (Сбер)
+gigachat_client_id: str = Field(default="", alias="GIGACHAT_CLIENT_ID")
+gigachat_client_secret: str = Field(default="", alias="GIGACHAT_CLIENT_SECRET")
+
+# DeepInfra (фоллбэк LLM)
+deepinfra_api_key: str = Field(default="", alias="DEEPINFRA_API_KEY")
+
+# Конфигурация LLM
+llm_provider: str = Field(default="gigachat", alias="LLM_PROVIDER")  # gigachat | deepinfra
+llm_viral_prompt_version: str = Field(default="v1", alias="LLM_VIRAL_PROMPT_VERSION")
+```
+
+---
+
+## Сводка по технологическому стеку
+
+### Новые зависимости (v2)
+
+| Пакет | Размер | Назначение | Фича |
+|---|---|---|---|
+| `mediapipe` | ~30 МБ | Детекция лиц (CPU) | 3 |
+| `httpx` | Уже установлен | API-вызовы к Deepgram, GigaChat, DeepInfra | 2, 3 |
+| **Итого новых зависимостей** | **~30 МБ** | | |
+
+### Удалённые зависимости (по сравнению с v1)
+
+| Пакет | Сэкономлено | Для чего использовался |
+|---|---|---|
+| ~~`openai-whisper`~~ | ~50 МБ + PyTorch ~2 ГБ | Транскрипция (заменён Deepgram) |
+| ~~`pyannote-audio`~~ | ~200 МБ | Диаризация (заменён Deepgram) |
+| ~~`torchaudio`~~ | ~50-80 МБ | Зависимость pyannote |
+| ~~`librosa`~~ | ~20 МБ | Энергия аудио (заменён сентиментом Deepgram) |
+| **Итого удалено** | **~2.3 ГБ** | |
+
+Добавили 30 мегабайт. Удалили 2.3 гигабайта. Соотношение 1:77.
+
+### Новые бэкенд-модули
+
+| Модуль | Назначение | Фича |
+|---|---|---|
+| `clips` | CRUD клипов, воркфлоу ревью | 2 |
+
+### Новые Remotion-композиции
+
+| Композиция | Назначение | Фича |
+|---|---|---|
+| `ShortsVideo` | Статический/ключевой кроп + субтитры в 9:16 | 4 |
+| `AutoEditVideo` | Динамический кроп с трекингом лица + субтитры | 3 |
+
+### Новые типы джобов
+
+| Тип джоба | Назначение | Фича |
+|---|---|---|
+| `VIRAL_DETECT` | Анализ транскрипции через GigaChat | 2 |
+| `ASPECT_CONVERT` | 9:16 кроп + пере-кодирование | 4 |
+| `FACE_DETECT` | Детекция bounding box лиц (MediaPipe) | 3 |
+
+Обратите внимание: `SPEAKER_DIARIZE` **больше не является отдельным типом джоба** — диаризация включена в транскрипцию Deepgram.
+
+### Расширение движков транскрипции
+
+```python
+# Расширяем существующий выбор движка:
+engine: Literal["whisper", "google", "deepgram"] = "deepgram"
+```
+
+Deepgram становится дефолтом. Whisper остаётся как опциональный фоллбэк (требует `uv sync --group whisper`).
+
+---
+
+## Сквозные проблемы (v2)
+
+### Остались из v1
+
+| Проблема | Приоритет | Действие |
+|---|---|---|
+| `_get_job_status_sync()` течёт соединениями к БД | Высокий | Починить до добавления новых акторов |
+| `tasks/service.py` — 1 674 строки, скоро перевалит за 2К | Средний | Вынести бойлерплейт акторов |
+| Дефолт `REMOTION_SERVICE_URL` в воркере неверный | Средний | Исправить на `http://remotion:3001` |
+| Ни на одном Docker-сервисе нет лимитов ресурсов | Средний | Добавить memory/CPU лимиты |
+| Нет очистки временных файлов при OOM-крэше | Средний | Добавить периодическую очистку |
+| Проверка `isCurrent` слова в Captions.tsx хрупкая | Низкий | Сравнивать по индексу |
+
+### Новые в v2
+
+| Проблема | Приоритет | Действие |
+|---|---|---|
+| Управление API-ключами (3 сервиса) | Высокий | Всё через env-переменные в settings, никогда в коде |
+| Обработка rate limit'ов API | Высокий | Retry с exponential backoff во всех акторах |
+| Зависимость от вендора | Средний | Абстрагировать за интерфейсом движков (существующий паттерн) |
+| Зависимость от сети (API недоступен) | Средний | Оставить Whisper как опциональный фоллбэк |
+| Конвертация Deepgram -> схема Document | Средний | Построить конвертер под существующую структуру `Document` |
+| Обновление OAuth2-токена GigaChat | Средний | Кэширование токена с авто-обновлением в `infrastructure/` |
+
+### Ликвидированные из v1
+
+| ~~Проблема~~ | Почему исчезла |
+|---|---|
+| ~~PyTorch CPU-only индекс~~ | PyTorch удалён полностью |
+| ~~OOM воркера на ML-джобах~~ | Нет тяжёлого ML локально |
+| ~~Docker-образ ML-воркера~~ | Один лёгкий образ |
+| ~~GPU-инфраструктура~~ | Весь ML — через API |
+| ~~Конфликты версий PyTorch~~ | Нет PyTorch |
+| ~~Скачивание моделей при первом запуске~~ | Нет локальных моделей |
+
+---
+
+## Отчёты специалистов
+
+Полные выводы специалистов доступны в стенограмме сессии. Ключевые файлы, которые изучал каждый:
+
+- **ML-инженер:** `cpv3/modules/transcription/service.py`, `cpv3/modules/tasks/service.py`, `pyproject.toml`
+- **Backend-архитектор:** `cpv3/modules/tasks/service.py`, `cpv3/modules/jobs/schemas.py`, `cpv3/modules/media/service.py`, `cpv3/modules/captions/service.py`, `docker-compose.yml`
+- **Remotion-инженер:** `remotion_service/src/components/Composition.tsx`, `Captions.tsx`, `Root.tsx`, `useCaptions.ts`, `useVideoMeta.ts`, все определения типов
+- **Frontend-архитектор:** `src/widgets/TimelinePanel/`, `src/features/project/FragmentsStep/`, `src/shared/context/WizardContext.tsx`, `src/shared/store/notifications/`
+- **DevOps-инженер:** `docker-compose.yml`, `Dockerfile`, `pyproject.toml`, `uv.lock`
+- **Инженер по производительности:** `cpv3/modules/tasks/service.py`, `cpv3/modules/media/service.py`, `cpv3/modules/transcription/service.py`, `docker-compose.yml`
+
+Примечание: Отчёты специалистов подготовлены для архитектуры v1 (локальный ML). Их рекомендации по Remotion-композициям, дизайну бэкенд-модулей, фронтенд-компонентам и форматам данных кропа остаются актуальными в v2. Рекомендации по инфраструктуре и ML-моделям заменены API-first подходом.
diff --git a/docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md b/docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md
new file mode 100644
index 0000000..a4e8153
--- /dev/null
+++ b/docs/superpowers/plans/2026-03-21-advanced-remotion-templates.md
@@ -0,0 +1,918 @@
+# Advanced Remotion Templates Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Extend the caption animation system with 4 new highlight styles, 2 new segment transitions, 3 new animation fields, and ship 2 system presets ("Шортс" and "Подкаст").
+
+**Architecture:** Additive changes only — extend existing `CaptionStyleSchema` with new enum values and fields, implement new animation rendering in `Captions.tsx` using Remotion primitives, seed 2 new system presets via Alembic migration, and add new form controls to the frontend StyleEditor.
+
+**Tech Stack:** Remotion (`interpolate()`, `spring()`), ElysiaJS (Elysia `t.*` validation), FastAPI + Pydantic + Alembic, Next.js + react-hook-form
+
+**Spec:** `docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md`
+
+---
+
+## File Map
+
+### Remotion Service (`remotion_service/`)
+
+| Action | File | Responsibility |
+|--------|------|----------------|
+| Modify | `server/types/CaptionStyleSchema.ts` | Add new enum values + 3 new fields to Elysia validation schema |
+| Modify | `src/types/caption_style.d.ts` | Mirror TypeScript type changes |
+| Modify | `src/components/Captions.tsx` | Implement 4 new highlight renderers, 2 new transitions, word entrance logic, rotation, text-transform |
+
+### Backend (`cofee_backend/`)
+
+| Action | File | Responsibility |
+|--------|------|----------------|
+| Modify | `cpv3/modules/captions/schemas.py` | Extend Pydantic `CaptionAnimationStyle` with new Literal values + 3 fields |
+| Create | `alembic/versions/e6f7a8b9c0d1_seed_shorts_podcast_presets.py` | Seed 2 new system presets |
+
+### Frontend (`cofee_frontend/`)
+
+| Action | File | Responsibility |
+|--------|------|----------------|
+| Modify | `src/features/project/CaptionSettingsStep/StyleEditor.tsx` | Add new select options + 3 new form fields to AnimationFields |
+
+---
+
+## Task 1: Extend Remotion Schema & Types
+
+**Files:**
+- Modify: `remotion_service/server/types/CaptionStyleSchema.ts` (lines 30-47)
+- Modify: `remotion_service/src/types/caption_style.d.ts` (lines 20-26)
+
+- [ ] **Step 1: Update Elysia validation schema**
+
+In `remotion_service/server/types/CaptionStyleSchema.ts`, replace the `CaptionAnimationStyle` object (lines 30-47) with:
+
+```typescript
+export const CaptionAnimationStyle = t.Object({
+  highlight_style: t.Union(
+    [
+      t.Literal("color"),
+      t.Literal("scale"),
+      t.Literal("underline"),
+      t.Literal("color_scale"),
+      t.Literal("pop_in"),
+      t.Literal("karaoke"),
+      t.Literal("bounce"),
+      t.Literal("glow_pulse"),
+    ],
+    { default: "color" },
+  ),
+  highlight_scale: t.Number({ default: 1.1 }),
+  segment_transition: t.Union(
+    [
+      t.Literal("fade"),
+      t.Literal("slide"),
+      t.Literal("none"),
+      t.Literal("zoom_in"),
+      t.Literal("drop_in"),
+    ],
+    { default: "fade" },
+  ),
+  fade_duration_frames: t.Number({ default: 3 }),
+  animation_speed: t.Number({ default: 1.0 }),
+  word_entrance: t.Union(
+    [t.Literal("none"), t.Literal("pop"), t.Literal("typewriter")],
+    { default: "none" },
+  ),
+  highlight_rotation_deg: t.Number({ default: 0 }),
+  text_transform: t.Union(
+    [t.Literal("none"), t.Literal("uppercase"), t.Literal("lowercase")],
+    { default: "none" },
+  ),
+});
+```
+
+- [ ] **Step 2: Update TypeScript type definitions**
+
+In `remotion_service/src/types/caption_style.d.ts`, replace the `CaptionAnimationStyle` type (lines 20-26) with:
+
+```typescript
+export type CaptionAnimationStyle = {
+  highlight_style:
+    | "color"
+    | "scale"
+    | "underline"
+    | "color_scale"
+    | "pop_in"
+    | "karaoke"
+    | "bounce"
+    | "glow_pulse";
+  highlight_scale: number;
+  segment_transition: "fade" | "slide" | "none" | "zoom_in" | "drop_in";
+  fade_duration_frames: number;
+  animation_speed: number;
+  word_entrance: "none" | "pop" | "typewriter";
+  highlight_rotation_deg: number;
+  text_transform: "none" | "uppercase" | "lowercase";
+};
+```
+
+- [ ] **Step 3: Type-check Remotion service**
+
+Run: `cd remotion_service && bunx tsc --noEmit`
+Expected: PASS — no type errors (existing code uses only the old enum values, which are still present)
+
+- [ ] **Step 4: Commit**
+
+```bash
+cd remotion_service
+git add server/types/CaptionStyleSchema.ts src/types/caption_style.d.ts
+git commit -m "feat(remotion): extend CaptionAnimationStyle schema with new highlight styles, transitions, and fields"
+```
+
+---
+
+## Task 2: Implement New Highlight Styles in Captions.tsx
+
+**Files:**
+- Modify: `remotion_service/src/components/Captions.tsx` (lines 57-136 — `StyledWord` component)
+
+This is the largest task. We modify the `StyledWord` component to handle 4 new highlight styles. The current `isCurrent` block (lines 86-133) handles `color`, `scale`, `underline`, `color_scale`. We add `pop_in`, `karaoke`, `bounce`, `glow_pulse`.
+
+- [ ] **Step 1: Add spring import**
+
+At the top of `remotion_service/src/components/Captions.tsx`, add `spring` to the Remotion import (line 2):
+
+```typescript
+import { interpolate, spring, useVideoConfig } from "remotion";
+```
+
+- [ ] **Step 2: Add `useVideoConfig` to StyledWord**
+
+Inside `StyledWord` (line 69), add `fps` extraction right after the destructure:
+
+```typescript
+const { fps } = useVideoConfig();
+```
+
+This is needed for `spring()` calls which require fps.
+
+- [ ] **Step 3: Implement `pop_in` highlight style**
+
+In `StyledWord`, inside the `if (isCurrent)` block (after the existing `underline` branch at line 132), add:
+
+```typescript
+    if (animation.highlight_style === "pop_in") {
+      const wordDuration = wordFrameTime.end - wordFrameTime.start;
+      const scale =
+        wordDuration > MIN_INTERPOLATE_SPAN * 2
+          ? spring({
+              fps,
+              frame: currentFrame - wordFrameTime.start,
+              config: { damping: 12, stiffness: 200 },
+              durationInFrames: Math.min(Math.ceil(wordDuration / 2), 15),
+            })
+          : 1;
+      const finalScale = interpolate(scale, [0, 1], [0, animation.highlight_scale]);
+      baseStyle.transform = `scale(${finalScale})`;
+    }
+```
+
+- [ ] **Step 4: Implement `karaoke` highlight style**
+
+After the `pop_in` block, add:
+
+```typescript
+    if (animation.highlight_style === "karaoke") {
+      const wordDuration = wordFrameTime.end - wordFrameTime.start;
+      const progress =
+        wordDuration > MIN_INTERPOLATE_SPAN
+          ? interpolate(
+              currentFrame,
+              [wordFrameTime.start, wordFrameTime.end],
+              [0, 100],
+              { extrapolateLeft: "clamp", extrapolateRight: "clamp" },
+            )
+          : 100;
+      baseStyle.background = `linear-gradient(to right, ${textStyle.highlight_color} ${progress}%, ${textStyle.text_color} ${progress}%)`;
+      baseStyle.WebkitBackgroundClip = "text";
+      baseStyle.WebkitTextFillColor = "transparent";
+      baseStyle.backgroundClip = "text";
+      // Override the color set above — karaoke uses gradient instead
+      baseStyle.color = undefined;
+    }
+```
+
+- [ ] **Step 5: Implement `bounce` highlight style**
+
+After the `karaoke` block, add:
+
+```typescript
+    if (animation.highlight_style === "bounce") {
+      const wordDuration = wordFrameTime.end - wordFrameTime.start;
+      const scale =
+        wordDuration > MIN_INTERPOLATE_SPAN * 2
+          ? spring({
+              fps,
+              frame: currentFrame - wordFrameTime.start,
+              config: { damping: 8, stiffness: 180 },
+              durationInFrames: Math.min(Math.ceil(wordDuration), 20),
+            })
+          : 1;
+      const finalScale = interpolate(scale, [0, 1], [1, animation.highlight_scale]);
+      baseStyle.transform = `scale(${finalScale})`;
+    }
+```
+
+- [ ] **Step 6: Implement `glow_pulse` highlight style**
+
+After the `bounce` block, add:
+
+```typescript
+    if (animation.highlight_style === "glow_pulse") {
+      const wordDuration = wordFrameTime.end - wordFrameTime.start;
+      const pulse =
+        wordDuration > MIN_INTERPOLATE_SPAN * 2
+          ? interpolate(
+              currentFrame,
+              [
+                wordFrameTime.start,
+                wordFrameTime.start + wordDuration * 0.25,
+                wordFrameTime.start + wordDuration * 0.5,
+                wordFrameTime.start + wordDuration * 0.75,
+                wordFrameTime.end,
+              ],
+              [4, 12, 4, 12, 4],
+              { extrapolateLeft: "clamp", extrapolateRight: "clamp" },
+            )
+          : 8;
+      baseStyle.textShadow = `0 0 ${pulse}px ${textStyle.highlight_color}, 0 0 ${pulse * 2}px ${textStyle.highlight_color}`;
+    }
+```
+
+- [ ] **Step 7: Add `highlight_rotation_deg` support**
+
+After all highlight style branches (still inside `if (isCurrent)`), add rotation support:
+
+```typescript
+    if (animation.highlight_rotation_deg > 0) {
+      const existingTransform = baseStyle.transform || "";
+      baseStyle.transform = `${existingTransform} rotate(${animation.highlight_rotation_deg}deg)`.trim();
+    }
+```
+
+- [ ] **Step 8: Type-check**
+
+Run: `cd remotion_service && bunx tsc --noEmit`
+Expected: PASS
+
+- [ ] **Step 9: Commit**
+
+```bash
+cd remotion_service
+git add src/components/Captions.tsx
+git commit -m "feat(remotion): implement pop_in, karaoke, bounce, glow_pulse highlight styles + rotation"
+```
+
+---
+
+## Task 3: Implement Word Entrance, Text Transform, and New Transitions
+
+**Files:**
+- Modify: `remotion_service/src/components/Captions.tsx`
+
+- [ ] **Step 1: Add `useVideoConfig` to the Captions component**
+
+At the top of the `Captions` component (around line 186), add fps extraction for `spring()` calls in transitions:
+
+```typescript
+  const { fps: videoFps } = useVideoConfig();
+```
+
+This is needed by the `drop_in` transition (Step 3) and must be declared before use.
+
+- [ ] **Step 2: Implement `word_entrance` in StyledWord**
+
+In `StyledWord`, BEFORE the `if (isCurrent)` block (around line 86), add word entrance logic. This controls how words appear before they're spoken. The spring also applies when the word is `isCurrent` (just started being spoken) so the pop-in is smooth regardless of highlight style:
+
+```typescript
+  // Word entrance: controls visibility/scale of words before their start frame
+  const entrance = style.animation.word_entrance ?? "none";
+  if (entrance !== "none" && currentFrame < wordFrameTime.start) {
+    // Word hasn't been spoken yet — hide it
+    if (entrance === "pop") {
+      baseStyle.transform = "scale(0)";
+      baseStyle.opacity = 0;
+    } else if (entrance === "typewriter") {
+      baseStyle.opacity = 0;
+    }
+  } else if (entrance === "pop" && currentFrame >= wordFrameTime.start) {
+    // Word has been spoken (or is being spoken right now) — spring it in
+    const framesSinceStart = currentFrame - wordFrameTime.start;
+    const popScale = spring({
+      fps,
+      frame: framesSinceStart,
+      config: { damping: 12, stiffness: 200 },
+      durationInFrames: 10,
+    });
+    // Set as base transform — the scale-based highlight styles (pop_in, bounce) overwrite it; rotation appends to it
+    baseStyle.transform = `scale(${popScale})`;
+  }
+```
+
+Note: The pop spring applies to BOTH `isCurrent` and past words. When `isCurrent`, the highlight style block below may overwrite `baseStyle.transform` (e.g., bounce sets its own scale). This is intentional — the highlight animation takes precedence once the word is active. For highlight styles that don't set transform (like `color` or `karaoke`), the pop spring provides the entrance animation.
+
+- [ ] **Step 3: Implement `zoom_in` and `drop_in` segment transitions**
+
+In the `Captions` component, after the existing `slide` transition block (around line 247), add both new transitions and a `scale` variable:
+
+```typescript
+  let scale = 1;
+
+  if (transition === "zoom_in" && !hasShortSegment) {
+    opacity = interpolate(
+      currentFrame,
+      hasFadePlateau
+        ? [start, fadeIn, fadeOut, end]
+        : [start, middleFrame, end],
+      hasFadePlateau ? [0, 1, 1, 0] : [0, 1, 0],
+      { extrapolateLeft: "clamp", extrapolateRight: "clamp" },
+    );
+    scale = interpolate(
+      currentFrame,
+      hasFadePlateau
+        ? [start, fadeIn, fadeOut, end]
+        : [start, middleFrame, end],
+      hasFadePlateau ? [0.8, 1, 1, 1.2] : [0.8, 1, 1.2],
+      { extrapolateLeft: "clamp", extrapolateRight: "clamp" },
+    );
+  }
+
+  if (transition === "drop_in" && !hasShortSegment) {
+    opacity = interpolate(
+      currentFrame,
+      hasFadePlateau
+        ? [start, fadeIn, fadeOut, end]
+        : [start, middleFrame, end],
+      hasFadePlateau ? [0, 1, 1, 0] : [0, 1, 0],
+      { extrapolateLeft: "clamp", extrapolateRight: "clamp" },
+    );
+    const dropSpring = spring({
+      fps: videoFps,
+      frame: currentFrame - start,
+      config: { damping: 10, stiffness: 150 },
+      durationInFrames: Math.min(fadeDuration * 3, 20),
+    });
+    // Spring goes 0→1, we want -50→0
+    translateY = interpolate(dropSpring, [0, 1], [-50, 0]);
+  }
+```
+
+Note: `videoFps` comes from `useVideoConfig()` added in Step 1.
+
+- [ ] **Step 4: Implement `text_transform` and update `segmentStyle.transform`**
+
+In the `Captions` component (line 250, inside the inline style mode block), rebuild `segmentStyle` to include `textTransform` and the composite `transform` (accounting for translateY + scale):
+
+```typescript
+    const textTransformValue = styleConfig.animation.text_transform ?? "none";
+
+    const segmentStyle: React.CSSProperties = {
+      opacity,
+      transform: [
+        translateY !== 0 ? `translateY(${translateY}px)` : "",
+        scale !== 1 ? `scale(${scale})` : "",
+      ].filter(Boolean).join(" ") || undefined,
+      display: "flex",
+      flexDirection: "column",
+      alignItems: "center",
+      textAlign: "center",
+      width: "100%",
+      padding: background.bg_padding_px,
+      background: background.bg_color,
+      borderRadius: background.bg_border_radius_px,
+      textTransform: textTransformValue !== "none" ? textTransformValue : undefined,
+    };
+```
+
+- [ ] **Step 5: Update the CSS theme mode segment div to include scale**
+
+In the CSS theme mode return (line 291), update the style to use the same composite transform:
+
+```typescript
+  return (
+    <div
+      className="segment"
+      style={{
+        opacity,
+        transform: [
+          translateY !== 0 ? `translateY(${translateY}px)` : "",
+          scale !== 1 ? `scale(${scale})` : "",
+        ].filter(Boolean).join(" ") || undefined,
+      }}
+    >
+```
+
+- [ ] **Step 6: Type-check**
+
+Run: `cd remotion_service && bunx tsc --noEmit`
+Expected: PASS
+
+- [ ] **Step 7: Visually verify in Remotion Studio**
+
+Run: `cd remotion_service && bun run dev`
+Open Remotion Studio in browser. Test each new animation by modifying the composition props in the studio UI:
+- Set `highlight_style` to each of: `pop_in`, `karaoke`, `bounce`, `glow_pulse`
+- Set `segment_transition` to each of: `zoom_in`, `drop_in`
+- Set `word_entrance` to `pop` and `typewriter`
+- Set `text_transform` to `uppercase`
+- Set `highlight_rotation_deg` to `3`
+
+Verify each renders without errors and the visual effect matches the spec description.
+
+- [ ] **Step 8: Commit**
+
+```bash
+cd remotion_service
+git add src/components/Captions.tsx
+git commit -m "feat(remotion): implement word entrance, text transform, zoom_in and drop_in transitions"
+```
+
+---
+
+## Task 4: Dynamic Font Loading
+
+**Files:**
+- Modify: `remotion_service/src/components/Captions.tsx` (lines 1-12 — imports and font loading)
+
+Currently, only Lobster is loaded (line 3 + line 12). The new presets use Montserrat and Inter. Instead of loading a font dynamically based on `styleConfig.text.font_family`, we eagerly load every supported font at module scope (see the note after the code block for the trade-off).
+
+- [ ] **Step 1: Replace single-font loading with loading of all supported fonts**
+
+Replace lines 1-12 of `Captions.tsx`:
+
+```typescript
+import React from "react";
+import { interpolate, spring, useVideoConfig } from "remotion";
+import { loadFont as loadLobster } from "@remotion/google-fonts/Lobster";
+import { loadFont as loadInter } from "@remotion/google-fonts/Inter";
+import { loadFont as loadMontserrat } from "@remotion/google-fonts/Montserrat";
+import { loadFont as loadRoboto } from "@remotion/google-fonts/Roboto";
+import { loadFont as loadOpenSans } from "@remotion/google-fonts/OpenSans";
+import {
+  LineWithFrames,
+  SegmentWithFrames,
+  WordWithFrames,
+} from "@/types/transcription";
+import { CaptionStyleConfig } from "@/types/caption_style";
+import { useTheme } from "@/hooks/useTheme";
+
+// Load all supported fonts up front so any preset's font_family is available at render time
+loadLobster();
+loadInter();
+loadMontserrat();
+loadRoboto();
+loadOpenSans();
+```
+
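+Note: Loading all 5 fonts eagerly keeps the component simple; the alternative (dynamic loading based on `styleConfig`) adds complexity. Be aware that `loadFont()` called without arguments requests every style, weight, and subset of the font, which can mean many network requests per font. If renders become slow or time out, pass only what the presets need, e.g. `loadFont("normal", { weights: ["400", "700"], subsets: ["latin", "cyrillic"] })` — confirm the available weights/subsets for each font against the `@remotion/google-fonts` documentation.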
+
+- [ ] **Step 2: Install missing font packages (if needed)**
+
+Run: `cd remotion_service && bunx tsc --noEmit`
+
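+Individual fonts are subpath exports of `@remotion/google-fonts`, not separate packages, so no per-font install is needed. If any `@remotion/google-fonts/*` import fails to resolve, confirm that `@remotion/google-fonts` itself is listed in `package.json` at the same version as the other Remotion packages. Once type-checking passes, check the fonts visually: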
+
+Run: `cd remotion_service && bun run dev`
+
+The fonts should load in the Remotion Studio preview.
+
+- [ ] **Step 3: Commit**
+
+```bash
+cd remotion_service
+git add src/components/Captions.tsx
+git commit -m "feat(remotion): load Inter, Montserrat, Roboto, OpenSans fonts alongside Lobster"
+```
+
+---
+
+## Task 5: Extend Backend Schema
+
+**Files:**
+- Modify: `cofee_backend/cpv3/modules/captions/schemas.py` (lines 37-42)
+
+- [ ] **Step 1: Update CaptionAnimationStyle Pydantic model**
+
+Replace lines 37-42 in `cofee_backend/cpv3/modules/captions/schemas.py`:
+
+```python
+class CaptionAnimationStyle(Schema):
+    highlight_style: Literal[
+        "color", "scale", "underline", "color_scale",
+        "pop_in", "karaoke", "bounce", "glow_pulse",
+    ] = "color"
+    highlight_scale: float = 1.1
+    segment_transition: Literal["fade", "slide", "none", "zoom_in", "drop_in"] = "fade"
+    fade_duration_frames: int = 3
+    animation_speed: float = 1.0
+    word_entrance: Literal["none", "pop", "typewriter"] = "none"
+    highlight_rotation_deg: float = 0
+    text_transform: Literal["none", "uppercase", "lowercase"] = "none"
+```
+
+- [ ] **Step 2: Lint and format**
+
+Run: `cd cofee_backend && uv run ruff check cpv3/modules/captions/schemas.py && uv run ruff format cpv3/modules/captions/schemas.py`
+Expected: PASS or auto-formatted
+
+- [ ] **Step 3: Verify existing tests still pass**
+
+Run: `cd cofee_backend && uv run pytest tests/integration/ -x -q 2>&1 | tail -5`
+Expected: All existing tests pass (new fields have defaults, so backward compatible)
+
+- [ ] **Step 4: Commit**
+
+```bash
+cd cofee_backend
+git add cpv3/modules/captions/schemas.py
+git commit -m "feat(backend): extend CaptionAnimationStyle with new highlight styles, transitions, and fields"
+```
+
+---
+
+## Task 6: Seed New System Presets (Alembic Migration)
+
+**Files:**
+- Create: `cofee_backend/alembic/versions/e6f7a8b9c0d1_seed_shorts_podcast_presets.py`
+
+- [ ] **Step 1: Create migration file**
+
+Create `cofee_backend/alembic/versions/e6f7a8b9c0d1_seed_shorts_podcast_presets.py`:
+
+```python
+"""seed shorts and podcast system presets
+
+Revision ID: e6f7a8b9c0d1
+Revises: d5e6f7a8b9c0
+Create Date: 2026-03-21 12:00:00.000000
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = "e6f7a8b9c0d1"
+down_revision: Union[str, None] = "d5e6f7a8b9c0"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+SHORTS_PRESET = {
+    "id": "00000000-0000-4000-a000-000000000004",
+    "user_id": None,
+    "name": "Шортс",
+    "description": "Жирные субтитры для вертикальных видео — TikTok, Reels, Shorts",
+    "is_system": True,
+    "style_config": {
+        "text": {
+            "font_family": "Montserrat",
+            "font_size": 72,
+            "font_weight": 700,
+            "text_color": "#FFFFFF",
+            "highlight_color": "#FFE500",
+            "text_shadow": "3px 3px 0px #000000",
+            "text_stroke_width": 3,
+            "text_stroke_color": "#000000",
+        },
+        "layout": {
+            "vertical_position": "bottom",
+            "horizontal_alignment": "center",
+            "padding_px": 20,
+            "max_width_pct": 85,
+            "lines_per_screen": 1,
+        },
+        "animation": {
+            "highlight_style": "bounce",
+            "highlight_scale": 1.15,
+            "highlight_rotation_deg": 3,
+            "word_entrance": "pop",
+            "segment_transition": "zoom_in",
+            "fade_duration_frames": 3,
+            "animation_speed": 1.0,
+            "text_transform": "uppercase",
+        },
+        "background": {
+            "bg_color": "transparent",
+            "bg_blur_px": 0,
+            "bg_glow_color": None,
+            "bg_border_radius_px": 0,
+            "bg_padding_px": 0,
+        },
+    },
+    "preview_url": None,
+    "is_active": True,
+}
+
+PODCAST_PRESET = {
+    "id": "00000000-0000-4000-a000-000000000005",
+    "user_id": None,
+    "name": "Подкаст",
+    "description": "Чистые субтитры для подкастов и интервью — караоке-подсветка, фон с размытием",
+    "is_system": True,
+    "style_config": {
+        "text": {
+            "font_family": "Inter",
+            "font_size": 44,
+            "font_weight": 400,
+            "text_color": "#E0E0E0",
+            "highlight_color": "#FFFFFF",
+            "text_shadow": "1px 1px 3px rgba(0,0,0,0.7)",
+            "text_stroke_width": 0,
+            "text_stroke_color": "#000000",
+        },
+        "layout": {
+            "vertical_position": "bottom",
+            "horizontal_alignment": "center",
+            "padding_px": 20,
+            "max_width_pct": 90,
+            "lines_per_screen": 2,
+        },
+        "animation": {
+            "highlight_style": "karaoke",
+            "highlight_scale": 1.0,
+            "highlight_rotation_deg": 0,
+            "word_entrance": "none",
+            "segment_transition": "fade",
+            "fade_duration_frames": 5,
+            "animation_speed": 1.0,
+            "text_transform": "none",
+        },
+        "background": {
+            "bg_color": "rgba(0,0,0,0.5)",
+            "bg_blur_px": 8,
+            "bg_glow_color": None,
+            "bg_border_radius_px": 12,
+            "bg_padding_px": 16,
+        },
+    },
+    "preview_url": None,
+    "is_active": True,
+}
+
+
+def upgrade() -> None:
+    # Idempotent: check if presets with these names already exist before inserting
+    conn = op.get_bind()
+
+    caption_presets = sa.table(
+        "caption_presets",
+        sa.column("id", sa.UUID()),
+        sa.column("user_id", sa.UUID()),
+        sa.column("name", sa.String()),
+        sa.column("description", sa.Text()),
+        sa.column("is_system", sa.Boolean()),
+        sa.column("style_config", sa.JSON()),
+        sa.column("preview_url", sa.String()),
+        sa.column("is_active", sa.Boolean()),
+    )
+
+    for preset in [SHORTS_PRESET, PODCAST_PRESET]:
+        exists = conn.execute(
+            sa.select(sa.func.count())
+            .select_from(caption_presets)
+            .where(caption_presets.c.name == preset["name"])
+            .where(caption_presets.c.is_system == True)  # noqa: E712
+        ).scalar()
+        if not exists:
+            op.bulk_insert(caption_presets, [preset])
+
+
+def downgrade() -> None:
+    conn = op.get_bind()
+    caption_presets = sa.table(
+        "caption_presets",
+        sa.column("id", sa.UUID()),
+        sa.column("is_active", sa.Boolean()),
+    )
+    for preset_id in [SHORTS_PRESET["id"], PODCAST_PRESET["id"]]:
+        conn.execute(
+            caption_presets.update()
+            .where(caption_presets.c.id == preset_id)
+            .values(is_active=False)
+        )
+```
+
+- [ ] **Step 2: Apply migration**
+
+Run: `cd cofee_backend && uv run alembic upgrade head`
+Expected: Migration applies successfully, 2 new rows in `caption_presets`
+
+- [ ] **Step 3: Verify presets exist via API**
+
+Run: `curl -s http://localhost:8000/api/captions/presets/ -H "Authorization: Bearer <token>" | python3 -m json.tool | grep -E '"name"'`
+Expected: Should list "Классические", "Неон", "Минимализм", "Шортс", "Подкаст"
+
+(If backend isn't running, verify via direct DB query instead)
+
+- [ ] **Step 4: Commit**
+
+```bash
+cd cofee_backend
+git add alembic/versions/e6f7a8b9c0d1_seed_shorts_podcast_presets.py
+git commit -m "feat(backend): seed Шортс and Подкаст system presets"
+```
+
+---
+
+## Task 7: Extend Frontend StyleEditor
+
+**Files:**
+- Modify: `cofee_frontend/src/features/project/CaptionSettingsStep/StyleEditor.tsx` (lines 38-71 — FormValues type, lines 73-106 — DEFAULT_VALUES, lines 360-452 — AnimationFields)
+
+- [ ] **Step 1: Regenerate API types from updated backend schema**
+
+Run: `cd cofee_frontend && bun run gen:api-types`
+Expected: `src/shared/api/__generated__/openapi.types.ts` updated with new animation fields
+
+(Backend must be running with the schema changes applied for this to work. If not available, proceed — the local `FormValues` type is what the form uses.)
+
+- [ ] **Step 2: Update FormValues type**
+
+In `StyleEditor.tsx`, replace lines 57-63 (the `animation` section of `FormValues`):
+
+```typescript
+	animation: {
+		highlight_style:
+			| "color"
+			| "scale"
+			| "underline"
+			| "color_scale"
+			| "pop_in"
+			| "karaoke"
+			| "bounce"
+			| "glow_pulse";
+		highlight_scale: number;
+		segment_transition: "fade" | "slide" | "none" | "zoom_in" | "drop_in";
+		fade_duration_frames: number;
+		animation_speed: number;
+		word_entrance: "none" | "pop" | "typewriter";
+		highlight_rotation_deg: number;
+		text_transform: "none" | "uppercase" | "lowercase";
+	}
+```
+
+- [ ] **Step 3: Update DEFAULT_VALUES**
+
+In `StyleEditor.tsx`, replace lines 92-98 (the `animation` section of `DEFAULT_VALUES`):
+
+```typescript
+	animation: {
+		highlight_style: "color" as const,
+		highlight_scale: 1.2,
+		segment_transition: "fade" as const,
+		fade_duration_frames: 5,
+		animation_speed: 1.0,
+		word_entrance: "none" as const,
+		highlight_rotation_deg: 0,
+		text_transform: "none" as const,
+	},
+```
+
+- [ ] **Step 4: Add new options to AnimationFields**
+
+In the `AnimationFields` component, add the 4 new highlight style options to the existing `<Select>` (after line 381, the `color_scale` SelectItem):
+
+```tsx
+						<SelectItem value="pop_in">Появление</SelectItem>
+						<SelectItem value="karaoke">Караоке</SelectItem>
+						<SelectItem value="bounce">Отскок</SelectItem>
+						<SelectItem value="glow_pulse">Свечение</SelectItem>
+```
+
+Add the 2 new segment transition options to the existing `<Select>` (after line 414, the `none` SelectItem):
+
+```tsx
+						<SelectItem value="zoom_in">Приближение</SelectItem>
+						<SelectItem value="drop_in">Выпадание</SelectItem>
+```
+
+- [ ] **Step 5: Add 3 new form fields to AnimationFields**
+
+After the last `<Controller>` in `AnimationFields` (the `animation_speed` slider, ending around line 451), add:
+
+```tsx
+		<Controller
+			name="animation.word_entrance"
+			control={control}
+			render={({ field }) => (
+				<div className={styles.fieldGroup}>
+					<span className={styles.fieldLabel}>Появление слов</span>
+					<Select
+						value={field.value}
+						onValueChange={field.onChange}
+						placeholder="Появление"
+					>
+						<SelectItem value="none">Все сразу</SelectItem>
+						<SelectItem value="pop">Выскакивание</SelectItem>
+						<SelectItem value="typewriter">Печатная машинка</SelectItem>
+					</Select>
+				</div>
+			)}
+		/>
+		<Controller
+			name="animation.highlight_rotation_deg"
+			control={control}
+			render={({ field }) => (
+				<div className={styles.sliderField}>
+					<Slider
+						label="Поворот выделения"
+						unit="°"
+						min={0}
+						max={15}
+						step={1}
+						value={field.value}
+						onChange={field.onChange}
+					/>
+				</div>
+			)}
+		/>
+		<Controller
+			name="animation.text_transform"
+			control={control}
+			render={({ field }) => (
+				<div className={styles.fieldGroup}>
+					<span className={styles.fieldLabel}>Регистр текста</span>
+					<Select
+						value={field.value}
+						onValueChange={field.onChange}
+						placeholder="Регистр"
+					>
+						<SelectItem value="none">Без изменений</SelectItem>
+						<SelectItem value="uppercase">ЗАГЛАВНЫЕ</SelectItem>
+						<SelectItem value="lowercase">строчные</SelectItem>
+					</Select>
+				</div>
+			)}
+		/>
+```
+
+- [ ] **Step 6: Type-check frontend**
+
+Run: `cd cofee_frontend && bunx tsc --noEmit`
+Expected: PASS (or pre-existing type errors only — see memory for known issues)
+
+- [ ] **Step 7: Commit**
+
+```bash
+cd cofee_frontend
+git add src/features/project/CaptionSettingsStep/StyleEditor.tsx
+git commit -m "feat(frontend): add new animation options and fields to StyleEditor"
+```
+
+---
+
+## Task 8: Final Integration Verification
+
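+- [ ] **Step 1: Type-check and lint all three projects**
+
+Run each of these from the repository root. Each command `cd`s into its own project, so run them in separate shells (or in parallel) rather than pasting the whole block into a single shell: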
+
+```bash
+cd remotion_service && bunx tsc --noEmit
+cd cofee_frontend && bunx tsc --noEmit
+cd cofee_backend && uv run ruff check cpv3/
+```
+
+Expected: All pass
+
+- [ ] **Step 2: Visual verification in Remotion Studio**
+
+Run: `cd remotion_service && bun run dev`
+
+In the Remotion Studio, test the two preset configs by pasting their `styleConfig` JSON into composition props:
+
+1. **Шортс preset**: Verify uppercase text, words pop in one by one, active word bounces with yellow color + rotation, zoom_in transition between segments
+2. **Подкаст preset**: Verify normal case, karaoke wipe on active word, frosted glass background, fade transition
+
+- [ ] **Step 3: Verify frontend editor shows new options**
+
+Run: `cd cofee_frontend && bun dev`
+
+Open http://localhost:3000, navigate to a project → Caption Settings → create or edit a preset:
+- Verify "Анимация" tab shows all 8 highlight styles, 5 transitions
+- Verify new fields appear: "Появление слов", "Поворот выделения", "Регистр текста"
+- Verify the 2 new system presets ("Шортс", "Подкаст") appear in the preset grid
+
+- [ ] **Step 4: End-to-end render test**
+
+If the full stack is running (backend + remotion service + S3):
+1. Select the "Шортс" preset
+2. Generate captions on a test video
+3. Verify the output video has uppercase text with bounce animation
+4. Repeat with "Подкаст" preset — verify karaoke wipe + frosted glass
+
+- [ ] **Step 5: Final commit (if any fixes were needed)**
+
+```bash
+git add -A
+git commit -m "fix: integration fixes for advanced remotion templates"
+```
diff --git a/docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md b/docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md
new file mode 100644
index 0000000..10dfa18
--- /dev/null
+++ b/docs/superpowers/specs/2026-03-21-advanced-remotion-templates-design.md
@@ -0,0 +1,229 @@
+# Advanced Remotion Templates — Design Spec
+
+## Summary
+
+Extend the Remotion caption animation system with new highlight styles, segment transitions, and per-word entrance effects. Create two polished system presets ("Шортс" and "Подкаст") using the new capabilities. No new Remotion compositions — presets are style configurations within the existing `CaptionedVideo` composition.
+
+## Context
+
+### Current State
+
+- Remotion service renders captions via a single `CaptionedVideo` composition
+- `CaptionStyleSchema` controls all styling: text, layout, animation, background
+- 4 highlight styles: `color`, `scale`, `underline`, `color_scale`
+- 3 segment transition values: `fade`, `slide`, `none`
+- 3 system presets seeded in DB: "Классические", "Неон", "Минимализм"
+- Frontend has preset grid browser + full style editor with live preview
+- Backend preset CRUD is complete with system/user preset separation
+
+### What This Changes
+
+- Adds 4 new highlight styles, 2 new segment transitions, 3 new animation fields
+- Adds 2 new system presets targeting Shorts/Clips and Podcast content creators
+- All changes are additive — existing presets and rendering continue to work unchanged
+
+## Approach
+
+**Extend existing schema (Approach A)** — add new enum values and fields to `CaptionAnimationStyle`. All rendering stays in the single `Captions.tsx` component. Chosen over separate compositions (too much duplication) and plugin architecture (over-engineered for 4-6 new animation types).
+
+## Animation System Extensions
+
+### New `highlight_style` Values
+
+| Style | Visual Effect | Implementation |
+|-------|--------------|----------------|
+| `pop_in` | Each word springs from scale 0→1 when spoken | `spring()` on `transform: scale()` keyed to word start frame |
+| `karaoke` | Color fills word left→right over its duration | CSS `linear-gradient` with `interpolate()` shifting stop from 0%→100% |
+| `bounce` | Active word overshoots scale (1→1.15→1.0) with elastic ease | `spring({ damping: 8 })` on scale, triggers at word start |
+| `glow_pulse` | Active word's text-shadow glow intensity oscillates | `interpolate()` cycling shadow blur/spread over word duration |
+
+### New `segment_transition` Values
+
+| Transition | Visual Effect |
+|-----------|--------------|
+| `zoom_in` | Old segment scales up + fades out, new segment scales 0.8→1 + fades in |
+| `drop_in` | New segment drops from above with spring bounce |
+
+### New Fields on `CaptionAnimationStyle`
+
+| Field | Type | Default | Purpose |
+|-------|------|---------|---------|
+| `word_entrance` | `"none" \| "pop" \| "typewriter"` | `"none"` | How unspoken words appear. `pop`: spring from scale 0→1 at word start. `typewriter`: words become visible sequentially (no scale animation). `none`: all words in segment visible immediately. |
+| `highlight_rotation_deg` | `float` (0–15) | `0` | Rotation in degrees applied to active word via `transform: rotate()` |
+| `text_transform` | `"none" \| "uppercase" \| "lowercase"` | `"none"` | CSS `text-transform` applied to entire caption container |
+
+### Backward Compatibility
+
+All new fields have defaults that match current behavior (`word_entrance: "none"`, `highlight_rotation_deg: 0`, `text_transform: "none"`). Existing presets and inline configs continue to work without changes.
+
+## System Presets
+
+### Preset: "Шортс" (Shorts/Clips)
+
+Target: Bold, high-energy captions for TikTok/Reels/Shorts vertical content.
+
+```json
+{
+  "text": {
+    "font_family": "Montserrat",
+    "font_size": 72,
+    "font_weight": 700,
+    "text_color": "#FFFFFF",
+    "highlight_color": "#FFE500",
+    "text_stroke_width": 3,
+    "text_stroke_color": "#000000",
+    "text_shadow": "3px 3px 0px #000000"
+  },
+  "layout": {
+    "vertical_position": "bottom",
+    "horizontal_alignment": "center",
+    "max_width_pct": 85,
+    "lines_per_screen": 1,
+    "padding_px": 20
+  },
+  "animation": {
+    "highlight_style": "bounce",
+    "highlight_scale": 1.15,
+    "highlight_rotation_deg": 3,
+    "word_entrance": "pop",
+    "segment_transition": "zoom_in",
+    "fade_duration_frames": 3,
+    "animation_speed": 1.0,
+    "text_transform": "uppercase"
+  },
+  "background": {
+    "bg_color": "transparent",
+    "bg_blur_px": 0,
+    "bg_glow_color": null,
+    "bg_border_radius_px": 0,
+    "bg_padding_px": 0
+  }
+}
+```
+
+Key characteristics:
+- All caps, 1 line at a time, no background box
+- Words pop in at full size via spring animation
+- Active word: yellow + 1.15x bounce + 3° rotation (no glow — `bg_glow_color` is `null` and the text shadow is a hard offset)
+- Heavy text stroke provides contrast without background
+- Zoom transition between segments
+
+### Preset: "Подкаст" (Podcast)
+
+Target: Clean, professional captions for long-form podcast/interview content.
+
+```json
+{
+  "text": {
+    "font_family": "Inter",
+    "font_size": 44,
+    "font_weight": 400,
+    "text_color": "#E0E0E0",
+    "highlight_color": "#FFFFFF",
+    "text_stroke_width": 0,
+    "text_stroke_color": null,
+    "text_shadow": "1px 1px 3px rgba(0,0,0,0.7)"
+  },
+  "layout": {
+    "vertical_position": "bottom",
+    "horizontal_alignment": "center",
+    "max_width_pct": 90,
+    "lines_per_screen": 2,
+    "padding_px": 20
+  },
+  "animation": {
+    "highlight_style": "karaoke",
+    "highlight_scale": 1.0,
+    "highlight_rotation_deg": 0,
+    "word_entrance": "none",
+    "segment_transition": "fade",
+    "fade_duration_frames": 5,
+    "animation_speed": 1.0,
+    "text_transform": "none"
+  },
+  "background": {
+    "bg_color": "rgba(0,0,0,0.5)",
+    "bg_blur_px": 8,
+    "bg_glow_color": null,
+    "bg_border_radius_px": 12,
+    "bg_padding_px": 16
+  }
+}
+```
+
+Key characteristics:
+- Normal case, 2 lines, frosted glass background
+- Karaoke wipe fills active word left→right with white
+- All words visible — no entrance animation
+- Subtle fade between segments
+- Inter font, soft white for readability
+
+## Changes Per Layer
+
+### Remotion Service (`remotion_service/`)
+
+**`server/types/CaptionStyleSchema.ts`**
+- Extend `highlight_style` union: add `"pop_in" | "karaoke" | "bounce" | "glow_pulse"`
+- Extend `segment_transition` union: add `"zoom_in" | "drop_in"`
+- Add fields: `word_entrance`, `highlight_rotation_deg`, `text_transform` with defaults
+
+**`src/components/Captions.tsx`** (~150 lines added)
+- New rendering branches for each highlight style using `interpolate()` and `spring()`
+- `word_entrance` logic: controls opacity/scale of words before their `wordStartFrame`
+- `highlight_rotation_deg`: applies `transform: rotate()` on active word
+- `text_transform`: CSS `text-transform` on caption container (lives in animation schema because it's applied at render time alongside animation logic)
+- All animations must use Remotion primitives only — no CSS transitions, no Framer Motion
+- Load `Montserrat` and `Inter` via `@remotion/google-fonts` alongside existing `Lobster` — dynamically load based on `styleConfig.text.font_family`
+
+**No changes to:** `Root.tsx`, `Composition.tsx`, `useCaptions.ts`, server endpoints, queue, S3 logic
+
+### Backend (`cofee_backend/`)
+
+**`cpv3/modules/captions/schemas.py`**
+- Extend `CaptionAnimationStyle` Literal types to include new values
+- Add 3 new Optional fields with defaults matching current behavior
+
+**Alembic migration**
+- Seed 2 new system presets ("Шортс", "Подкаст") into `caption_presets` table with `is_system=True`, `user_id=NULL`
+- Seed must be idempotent — check for existing name before inserting to avoid duplicates on re-run
+
+**No changes to:** router, service, repository, task system, webhooks, notifications
+
+### Frontend (`cofee_frontend/`)
+
+**`features/project/CaptionSettingsStep/StyleEditor.tsx`**
+- Add 4 new options to highlight style `<select>`
+- Add 2 new options to segment transition `<select>`
+- Add 3 new form fields: word_entrance `<select>`, rotation slider, text_transform `<select>`
+- Update local `FormValues` type to include new literal values (it duplicates backend types)
+
+**`features/project/CaptionSettingsStep/StylePreview.tsx`** (optional enhancement)
+- Hint at karaoke effect with gradient in static preview
+- Not critical — real preview is the rendered video
+
+**No new components, no new files, no new API endpoints.**
+
+## Data Flow
+
+Unchanged. The existing flow handles this entirely:
+
+1. User picks preset or edits style → `style_config` JSON
+2. Submit → `POST /api/tasks/captions-generate/` with `preset_id` or inline config
+3. Backend resolves config → sends to Remotion service
+4. Remotion reads new fields from `styleConfig`, renders with new animation logic
+5. Output → S3 → webhook → notification → frontend
+
+## Testing
+
+- **Remotion**: Visual testing via `bun run dev` (Remotion Studio) — verify each new animation style renders correctly with sample transcription data
+- **Backend**: Existing integration tests cover preset CRUD — add test cases with new fields to verify persistence and retrieval
+- **Frontend**: Existing E2E covers preset selection flow — verify new select options appear and are selectable
+- **Type-check**: `bunx tsc --noEmit` in both `remotion_service/` and `cofee_frontend/`
+
+## Out of Scope
+
+- New Remotion compositions (only extending existing `CaptionedVideo`)
+- Layout templates (split-screen, PiP, speaker labels)
+- Social media overlays (progress bars, CTAs)
+- Video cropping/resizing
+- Preview rendering in the style editor (static CSS preview is sufficient)