diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..b5091bf70036e463f1c196ee7553304faa4e9893 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/check-en.png filter=lfs diff=lfs merge=lfs -text
+docs/check-zh.png filter=lfs diff=lfs merge=lfs -text
+docs/img001-en.png filter=lfs diff=lfs merge=lfs -text
+docs/img001-zh.png filter=lfs diff=lfs merge=lfs -text
+docs/img004-en.png filter=lfs diff=lfs merge=lfs -text
+docs/img004-zh.png filter=lfs diff=lfs merge=lfs -text
+docs/img005-zh.png filter=lfs diff=lfs merge=lfs -text
+docs/img006-en.png filter=lfs diff=lfs merge=lfs -text
+docs/img006-zh.png filter=lfs diff=lfs merge=lfs -text
+docs/img007-en.png filter=lfs diff=lfs merge=lfs -text
+docs/img007-zh.png filter=lfs diff=lfs merge=lfs -text
+docs/index-en.png filter=lfs diff=lfs merge=lfs -text
+docs/index-zh.png filter=lfs diff=lfs merge=lfs -text
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3498a5d866161e9f7d7d80c889377bdc540637ee
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,63 @@
+# Build stage (NOTE(review): the READMEs list Python 3.12+, this image is 3.10 — confirm)
+FROM python:3.10-slim-bullseye AS builder
+
+# Set the working directory
+WORKDIR /build
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    git-lfs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create the virtual environment
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install PyTorch first (it is the largest dependency)
+RUN pip install --no-cache-dir torch torchvision torchaudio
+
+# Then install the remaining dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Runtime stage
+FROM python:3.10-slim-bullseye
+
+# Set the working directory
+WORKDIR /NarratoAI
+
+# Copy the virtual environment from the builder stage
+COPY --from=builder /opt/venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y \
+    imagemagick \
+    ffmpeg \
+    wget \
+    git-lfs \
+    && rm -rf /var/lib/apt/lists/* \
+    && sed -i '/<policy domain="path" rights="none" pattern="@\*"/d' /etc/ImageMagick-6/policy.xml
+
+# Set environment variables
+ENV PYTHONPATH="/NarratoAI" \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+
+# Set directory permissions
+RUN chmod 777 /NarratoAI
+
+# Initialize git lfs
+RUN git lfs install
+
+# Copy the application code
+COPY . .
+
+# Expose ports
+EXPOSE 8501 8080
+
+# Use the script as the entrypoint
+COPY docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+ENTRYPOINT ["docker-entrypoint.sh"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..84e41e18df659156f0b3910d0ce6720f9c215bb0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 linyq
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README-en.md b/README-en.md
new file mode 100644
index 0000000000000000000000000000000000000000..c45cbb2f675fda1e1b04562da902629eda71b937
--- /dev/null
+++ b/README-en.md
@@ -0,0 +1,115 @@
+<div align="center">
+<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎📽️ </h1>
+<h3 align="center">An all-in-one AI-powered tool for film commentary and automated video editing.🎬🎞️ </h3>
+
+
+<h3>📖 English | <a href="README.md">简体中文</a> | <a href="README-ja.md">日本語</a> </h3>
+<div align="center">
+
+[//]: # (  <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
+</div>
+<br>
+NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLM to enhance efficient content creation.
+<br>
+
+[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
+[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
+[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
+[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
+
+<a href="https://discord.com/invite/V2pbAqqQNb" target="_blank">💬 Join the open source community to get project updates and the latest news.</a>
+
+<h2><a href="https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg?from=from_copylink" target="_blank">🎉🎉🎉 Official Documentation 🎉🎉🎉</a> </h2>
+<h3>Home</h3>
+
+![](docs/index-en.png)
+
+<h3>Video Review Interface</h3>
+
+![](docs/check-en.png)
+
+</div>
+
+## Latest News
+- 2025.05.11 Released new version 0.6.0, supports **short drama commentary** and optimized editing process
+- 2025.03.06 Released new version 0.5.2, supports DeepSeek R1 and DeepSeek V3 models for short drama mixing
+- 2024.12.16 Released new version 0.3.9, supports Alibaba Qwen2-VL model for video understanding; supports short drama mixing
+- 2024.11.24 Opened Discord community: https://discord.com/invite/V2pbAqqQNb
+- 2024.11.11 Migrated open source community, welcome to join! [Join the official community](https://github.com/linyqh/NarratoAI/wiki)
+- 2024.11.10 Released official documentation, details refer to [Official Documentation](https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg)
+- 2024.11.10 Released new version v0.3.5; optimized video editing process.
+
+## Major Benefits 🎉
+From now on, fully support DeepSeek model! Register to enjoy 20 million free tokens (worth 14 yuan platform quota), editing a 10-minute video only costs 0.1 yuan!  
+
+🔥 Quick benefits:  
+1️⃣ Click the link to register: https://cloud.siliconflow.cn/i/pyOKqFCV  
+2️⃣ Log in with your phone number, **be sure to fill in the invitation code: pyOKqFCV**  
+3️⃣ Receive a 14 yuan quota, experience high cost-effective AI editing quickly!  
+
+💡 Low cost, high creativity:  
+Silicon Flow API Key can be integrated with one click, doubling intelligent editing efficiency!  
+(Note: The invitation code is the only proof for benefit collection, automatically credited after registration)  
+
+Immediately take action to unlock your AI productivity with "pyOKqFCV"!
+
+😊 Update Steps:
+Integration Package: Click update.bat one-click update script
+Code Build: Use git pull to fetch the latest code
+
+## Announcement 📢
+_**Note⚠️: Recently, someone has been impersonating the author on x (Twitter) to issue tokens on the pump.fun platform! This is a scam!!! Do not be deceived! Currently, NarratoAI has not made any official promotions on x (Twitter), please be cautious**_
+
+Below is a screenshot of this person's x (Twitter) homepage
+
+<img src="https://github.com/user-attachments/assets/c492ab99-52cd-4ba2-8695-1bd2073ecf12" alt="Screenshot_20250109_114131_Samsung Internet" style="width:30%; height:auto;">
+
+## Future Plans 🥳
+- [x] Windows Integration Pack Release
+- [x] Optimized the story generation process and improved the generation effect
+- [x] Released version 0.3.5 integration package
+- [x] Support Alibaba Qwen2-VL large model for video understanding
+- [x] Support short drama mixing
+  - [x] One-click merge materials
+  - [x] One-click transcription
+  - [x] One-click clear cache
+- [ ] Support exporting to Jianying drafts
+- [X] Support short drama commentary
+- [ ] Character face matching
+- [ ] Support automatic matching based on voiceover, script, and video materials
+- [ ] Support more TTS engines
+- [ ] ...
+
+## System Requirements 📦
+
+- Recommended minimum: CPU with 4 cores or more, 8GB RAM or more, GPU is not required
+- Windows 10/11 or MacOS 11.0 or above
+- [Python 3.12+](https://www.python.org/downloads/)
+
+## Feedback & Suggestions 📢
+
+👏 1. You can submit [issue](https://github.com/linyqh/NarratoAI/issues) or [pull request](https://github.com/linyqh/NarratoAI/pulls)
+
+💬 2. [Join the open source community exchange group](https://github.com/linyqh/NarratoAI/wiki)
+
+📷 3. Follow the official account [NarratoAI助手] to grasp the latest news
+
+## Reference Projects 📚
+- https://github.com/FujiwaraChoki/MoneyPrinter
+- https://github.com/harry0703/MoneyPrinterTurbo
+
+This project was refactored based on the above projects with the addition of video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳 
+
+## Buy the Author a Cup of Coffee ☕️
+<div style="display: flex; justify-content: space-between;">
+  <img src="https://github.com/user-attachments/assets/5038ccfb-addf-4db1-9966-99415989fd0c" alt="Image 1" style="width: 350px; height: 350px; margin: auto;"/>
+  <img src="https://github.com/user-attachments/assets/07d4fd58-02f0-425c-8b59-2ab94b4f09f8" alt="Image 2" style="width: 350px; height: 350px; margin: auto;"/>
+</div>
+
+## License 📝
+
+Click to view [`LICENSE`](LICENSE) file
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)
diff --git a/README-ja.md b/README-ja.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd6a8e5f6ba32d7daaf6bad9f5815cef9d39cbcb
--- /dev/null
+++ b/README-ja.md
@@ -0,0 +1,84 @@
+<div align="center">
+<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎📽️ </h1>
+<h3 align="center">一体型AI映画解説および自動ビデオ編集ツール🎬🎞️ </h3>
+
+<h3>📖 <a href="README.md">简体中文</a> | <a href="README-en.md">English</a> | 日本語 </h3>
+<div align="center">
+
+[//]: # (  <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
+</div>
+<br>
+NarratoAIは、LLMを活用してスクリプト作成、自動ビデオ編集、ナレーション、字幕生成の一体型ソリューションを提供する自動化ビデオナレーションツールです。
+<br>
+
+[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
+[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
+[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
+[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
+
+<a href="https://discord.com/invite/V2pbAqqQNb" target="_blank">💬 Discordオープンソースコミュニティに参加して、プロジェクトの最新情報を入手しましょう。</a>
+
+<h2><a href="https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg?from=from_copylink" target="_blank">🎉🎉🎉 公式ドキュメント 🎉🎉🎉</a> </h2>
+<h3>ホーム</h3>
+
+![](docs/index-zh.png)
+
+<h3>ビデオレビューインターフェース</h3>
+
+![](docs/check-zh.png)
+
+</div>
+
+## 最新情報
+- 2024.11.24 Discordコミュニティ開設：https://discord.com/invite/V2pbAqqQNb
+- 2024.11.11 オープンソースコミュニティに移行、参加を歓迎します！ [公式コミュニティに参加](https://github.com/linyqh/NarratoAI/wiki)
+- 2024.11.10 公式ドキュメント公開、詳細は [公式ドキュメント](https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg) を参照
+- 2024.11.10 新バージョンv0.3.5リリース；ビデオ編集プロセスの最適化
+
+## 今後の計画 🥳
+- [x] Windows統合パックリリース
+- [x] ストーリー生成プロセスの最適化、生成効果の向上
+- [x] バージョン0.3.5統合パックリリース
+- [x] アリババQwen2-VL大規模モデルのビデオ理解サポート
+- [x] 短編ドラマの解説サポート
+  - [x] 一クリックで素材を統合
+  - [x] 一クリックで文字起こし
+  - [x] 一クリックでキャッシュをクリア
+- [ ] ジャン映草稿のエクスポートをサポート
+- [ ] 主役の顔のマッチング
+- [ ] 音声、スクリプト、ビデオ素材に基づいて自動マッチングをサポート
+- [ ] より多くのTTSエンジンをサポート
+- [ ] ...
+
+## システム要件 📦
+
+- 推奨最低：CPU 4コア以上、メモリ8GB以上、GPUは必須ではありません
+- Windows 10またはMacOS 11.0以上
+
+## フィードバックと提案 📢
+
+👏 1. [issue](https://github.com/linyqh/NarratoAI/issues)または[pull request](https://github.com/linyqh/NarratoAI/pulls)を提出できます
+
+💬 2. [オープンソースコミュニティ交流グループに参加](https://github.com/linyqh/NarratoAI/wiki)
+
+📷 3. 公式アカウント【NarratoAI助手】をフォローして最新情報を入手
+
+## 参考プロジェクト 📚
+- https://github.com/FujiwaraChoki/MoneyPrinter
+- https://github.com/harry0703/MoneyPrinterTurbo
+
+このプロジェクトは上記のプロジェクトを基にリファクタリングされ、映画解説機能が追加されました。オリジナルの作者に感謝します 🥳🥳🥳 
+
+## 作者にコーヒーを一杯おごる ☕️
+<div style="display: flex; justify-content: space-between;">
+  <img src="https://github.com/user-attachments/assets/5038ccfb-addf-4db1-9966-99415989fd0c" alt="Image 1" style="width: 350px; height: 350px; margin: auto;"/>
+  <img src="https://github.com/user-attachments/assets/07d4fd58-02f0-425c-8b59-2ab94b4f09f8" alt="Image 2" style="width: 350px; height: 350px; margin: auto;"/>
+</div>
+
+## ライセンス 📝
+
+[`LICENSE`](LICENSE) ファイルをクリックして表示
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)
diff --git a/README.md b/README.md
index 465ef4726b9fb8de33f42060aab888987e4e983a..24da0a1c59b99ef87ea4038a2e6748b737886b75 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,118 @@
----
-title: Aegwe4
-emoji: 🔥
-colorFrom: blue
-colorTo: indigo
-sdk: docker
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+<div align="center">
+<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎📽️ </h1>
+<h3 align="center">一站式 AI 影视解说+自动化剪辑工具🎬🎞️ </h3>
+
+
+<h3>📖 <a href="README-en.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
+<div align="center">
+
+[//]: # (  <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
+</div>
+<br>
+NarratoAI 是一个自动化影视解说工具，基于LLM实现文案撰写、自动化视频剪辑、配音和字幕生成的一站式流程，助力高效内容创作。
+<br>
+
+[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
+[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
+[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
+[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
+
+<a href="https://discord.com/invite/V2pbAqqQNb" target="_blank">💬 加入 discord 开源社区，获取项目动态和最新资讯。</a>
+
+<h2><a href="https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg?from=from_copylink" target="_blank">🎉🎉🎉 官方文档 🎉🎉🎉</a> </h2>
+<h3>首页</h3>
+
+![](docs/index-zh.png)
+
+<h3>视频审查界面</h3>
+
+![](docs/check-zh.png)
+
+</div>
+
+## 最新资讯
+- 2025.05.11 发布新版本 0.6.0，支持 **短剧解说** 和 优化剪辑流程
+- 2025.03.06 发布新版本 0.5.2，支持 DeepSeek R1 和 DeepSeek V3 模型进行短剧混剪
+- 2024.12.16 发布新版本 0.3.9，支持阿里 Qwen2-VL 模型理解视频；支持短剧混剪
+- 2024.11.24 开通 discord 社群：https://discord.com/invite/V2pbAqqQNb
+- 2024.11.11 迁移开源社群，欢迎加入！ [加入官方社群](https://github.com/linyqh/NarratoAI/wiki)
+- 2024.11.10 发布官方文档，详情参见 [官方文档](https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg)
+- 2024.11.10 发布新版本 v0.3.5；优化视频剪辑流程
+
+## 重磅福利 🎉
+即日起全面支持DeepSeek模型！注册即享2000万免费Token（价值14元平台配额），剪辑10分钟视频仅需0.1元！  
+
+🔥 快速领福利：  
+1️⃣ 点击链接注册：https://cloud.siliconflow.cn/i/pyOKqFCV  
+2️⃣ 使用手机号登录，**务必填写邀请码：pyOKqFCV**  
+3️⃣ 领取14元配额，极速体验高性价比AI剪辑  
+
+💡 小成本大创作：  
+硅基流动API Key一键接入，智能剪辑效率翻倍！  
+（注：邀请码为福利领取唯一凭证，注册后自动到账）  
+
+立即行动，用「pyOKqFCV」解锁你的AI生产力！
+
+😊 更新步骤：
+整合包：点击 update.bat 一键更新脚本
+代码构建：使用 git pull 拉取最新代码
+
+## 公告 📢
+_**注意⚠️：近期在 x (推特) 上发现有人冒充作者在 pump.fun 平台上发行代币！ 这是骗子！！！ 不要被割了韭菜！！！
+目前 NarratoAI 没有在 x(推特) 上做任何官方宣传，注意甄别**_
+
+下面是此人 x(推特) 首页截图
+
+<img src="https://github.com/user-attachments/assets/c492ab99-52cd-4ba2-8695-1bd2073ecf12" alt="Screenshot_20250109_114131_Samsung Internet" style="width:30%; height:auto;">
+
+## 未来计划 🥳
+- [x] windows 整合包发布
+- [x] 优化剧情生成流程，提升生成效果
+- [x] 发布 0.3.5 整合包
+- [x] 支持阿里 Qwen2-VL 大模型理解视频
+- [x] 支持短剧混剪
+  - [x] 一键合并素材
+  - [x] 一键转录
+  - [x] 一键清理缓存
+- [ ] 支持导出剪映草稿
+- [X] 支持短剧解说
+- [ ] 主角人脸匹配
+- [ ] 支持根据口播，文案，视频素材自动匹配
+- [ ] 支持更多 TTS 引擎
+- [ ] ...
+
+## 配置要求 📦
+
+- 建议最低 CPU 4核或以上，内存 8G 或以上，显卡非必须
+- Windows 10/11 或 MacOS 11.0 以上系统
+- [Python 3.12+](https://www.python.org/downloads/)
+
+## 反馈建议 📢
+
+👏 1. 可以提交 [issue](https://github.com/linyqh/NarratoAI/issues)或者 [pull request](https://github.com/linyqh/NarratoAI/pulls)
+
+💬 2. [加入开源社区交流群](https://github.com/linyqh/NarratoAI/wiki)
+
+📷 3. 关注公众号【NarratoAI助手】，掌握最新资讯
+
+## 参考项目 📚
+- https://github.com/FujiwaraChoki/MoneyPrinter
+- https://github.com/harry0703/MoneyPrinterTurbo
+
+该项目基于以上项目重构而来，增加了影视解说功能，感谢大佬的开源精神 🥳🥳🥳 
+
+## 请作者喝一杯咖啡 ☕️
+<div style="display: flex; justify-content: space-between;">
+  <img src="https://github.com/user-attachments/assets/5038ccfb-addf-4db1-9966-99415989fd0c" alt="Image 1" style="width: 350px; height: 350px; margin: auto;"/>
+  <img src="https://github.com/user-attachments/assets/07d4fd58-02f0-425c-8b59-2ab94b4f09f8" alt="Image 2" style="width: 350px; height: 350px; margin: auto;"/>
+</div>
+
+## 许可证 📝
+
+点击查看 [`LICENSE`](LICENSE) 文件
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)
+
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/app/asgi.py b/app/asgi.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac06685a27e86838ec8b392c50742d3b3de548d2
--- /dev/null
+++ b/app/asgi.py
@@ -0,0 +1,90 @@
+"""Application implementation - ASGI."""
+
+import os
+
+from fastapi import FastAPI, Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from loguru import logger
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+
+from app.config import config
+from app.models.exception import HttpException
+from app.router import root_api_router
+from app.utils import utils
+from app.utils import ffmpeg_utils
+
+
+def exception_handler(request: Request, e: HttpException):
+    # Render an HttpException as JSON using the shared response envelope.
+    code = e.status_code
+    body = utils.get_response(code, e.data, e.message)
+    return JSONResponse(status_code=code, content=body)
+
+
+def validation_exception_handler(request: Request, e: RequestValidationError):
+    """Report a request-validation failure as HTTP 400 with the error list as data."""
+    bad_request = 400
+    body = utils.get_response(
+        status=bad_request, data=e.errors(), message="field required"
+    )
+    return JSONResponse(status_code=bad_request, content=body)
+
+
+def get_application() -> FastAPI:
+    """Create the FastAPI app with the root router and JSON error handlers attached."""
+    api = FastAPI(
+        title=config.project_name,
+        description=config.project_description,
+        version=config.project_version,
+        debug=False,
+    )
+    api.include_router(root_api_router)
+    # Map each handled exception type to the function that renders it as JSON.
+    handlers = (
+        (HttpException, exception_handler),
+        (RequestValidationError, validation_exception_handler),
+    )
+    for exc_type, handler in handlers:
+        api.add_exception_handler(exc_type, handler)
+    return api
+
+
+app = get_application()
+
+# CORS: CORS_ALLOWED_ORIGINS is a comma-separated origin list (entries are stripped); unset or empty allows any origin.
+cors_allowed_origins_str = os.getenv("CORS_ALLOWED_ORIGINS", "")
+origins = [o.strip() for o in cors_allowed_origins_str.split(",") if o.strip()] if cors_allowed_origins_str else ["*"]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+task_dir = utils.task_dir()  # served under /tasks
+app.mount(
+    "/tasks", StaticFiles(directory=task_dir, html=True, follow_symlink=True), name=""
+)
+
+public_dir = utils.public_dir()  # served at the site root, mounted after /tasks
+app.mount("/", StaticFiles(directory=public_dir, html=True), name="")
+
+
+@app.on_event("shutdown")  # NOTE(review): recent FastAPI deprecates on_event in favor of lifespan handlers
+def shutdown_event():  # shutdown hook: only writes a log line
+    logger.info("shutdown event")
+
+
+@app.on_event("startup")
+def startup_event():
+    logger.info("startup event")
+
+    # Detect FFmpeg hardware acceleration and log the result
+    hwaccel_info = ffmpeg_utils.detect_hardware_acceleration()
+    if hwaccel_info["available"]:
+        logger.info(f"FFmpeg硬件加速检测结果: 可用 | 类型: {hwaccel_info['type']} | 编码器: {hwaccel_info['encoder']} | 独立显卡: {hwaccel_info['is_dedicated_gpu']} | 参数: {hwaccel_info['hwaccel_args']}")
+    else:
+        logger.warning(f"FFmpeg硬件加速不可用: {hwaccel_info['message']}, 将使用CPU软件编码")
diff --git a/app/config/__init__.py b/app/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd468128b7999ad261d03bf81a7753d5843a882e
--- /dev/null
+++ b/app/config/__init__.py
@@ -0,0 +1,56 @@
+import os
+import sys
+
+from loguru import logger
+
+from app.config import config
+from app.utils import utils
+
+
+def __init_logger():
+    # _log_file = utils.storage_dir("logs/server.log")
+    _lvl = config.log_level
+    root_dir = os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+    )
+
+    def format_record(record):
+        # Full path of the file that emitted the log record
+        file_path = record["file"].path
+        # Convert the absolute path to one relative to the project root
+        relative_path = os.path.relpath(file_path, root_dir)
+        # Overwrite the path stored on the record
+        record["file"].path = f"./{relative_path}"
+        # Return the format string used for this record
+        # Adjust the format here as needed
+        _format = (
+            "<green>{time:%Y-%m-%d %H:%M:%S}</> | "
+            + "<level>{level}</> | "
+            + '"{file.path}:{line}":<blue> {function}</> '
+            + "- <level>{message}</>"
+            + "\n"
+        )
+        return _format
+
+    logger.remove()
+
+    logger.add(
+        sys.stdout,
+        level=_lvl,
+        format=format_record,
+        colorize=True,
+    )
+
+    # logger.add(
+    #     _log_file,
+    #     level=_lvl,
+    #     format=format_record,
+    #     rotation="00:00",
+    #     retention="3 days",
+    #     backtrace=True,
+    #     diagnose=True,
+    #     enqueue=True,
+    # )
+
+
+__init_logger()
diff --git a/app/config/config.py b/app/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b2b0b4fd9d95872582470d82f2f0707670ecf51
--- /dev/null
+++ b/app/config/config.py
@@ -0,0 +1,85 @@
+import os
+import socket
+import toml
+import shutil
+from loguru import logger
+
+root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+config_file = f"{root_dir}/config.toml"  # loaded by load_config(), rewritten by save_config()
+version_file = f"{root_dir}/project_version"  # read by get_version_from_file()
+
+
+def get_version_from_file():
+    """Return the version read from the project_version file, or "0.1.0" as a fallback."""
+    try:
+        if os.path.isfile(version_file):
+            with open(version_file, "r", encoding="utf-8") as version_fh:
+                return version_fh.read().strip()
+    except Exception as e:
+        logger.error(f"读取版本号文件失败: {str(e)}")
+    # A missing file and a read failure both fall through to the default version.
+    return "0.1.0"
+
+
+def load_config():
+    """Load config.toml, seeding it from config.example.toml when absent, and return the parsed data."""
+    # A directory at the config path caused IsADirectoryError ('/NarratoAI/config.toml'); remove it.
+    if os.path.isdir(config_file):
+        shutil.rmtree(config_file)
+
+    template = f"{root_dir}/config.example.toml"
+    if not os.path.isfile(config_file) and os.path.isfile(template):
+        shutil.copyfile(template, config_file)
+        logger.info("copy config.example.toml to config.toml")
+
+    logger.info(f"load config from file: {config_file}")
+
+    try:
+        return toml.load(config_file)
+    except Exception as e:
+        # Second attempt: read the file as utf-8-sig and parse the text.
+        logger.warning(f"load config failed: {str(e)}, try to load as utf-8-sig")
+        with open(config_file, mode="r", encoding="utf-8-sig") as bom_fp:
+            raw = bom_fp.read()
+            return toml.loads(raw)
+
+
+def save_config():
+    """Write the current app, azure and ui sections (plus the rest of _cfg) back to config.toml."""
+    _cfg.update(app=app, azure=azure, ui=ui)
+    serialized = toml.dumps(_cfg)  # serialize before opening so a failure cannot truncate the file
+    with open(config_file, "w", encoding="utf-8") as f:
+        f.write(serialized)
+
+
+_cfg = load_config()
+app = _cfg.get("app", {})
+whisper = _cfg.get("whisper", {})
+proxy = _cfg.get("proxy", {})
+azure = _cfg.get("azure", {})
+ui = _cfg.get("ui", {})
+frames = _cfg.get("frames", {})
+
+hostname = socket.gethostname()
+
+log_level = _cfg.get("log_level", "DEBUG")
+listen_host = _cfg.get("listen_host", "0.0.0.0")
+listen_port = _cfg.get("listen_port", 8080)
+project_name = _cfg.get("project_name", "NarratoAI")
+project_description = _cfg.get(
+    "project_description",
+    "<a href='https://github.com/linyqh/NarratoAI'>https://github.com/linyqh/NarratoAI</a>",
+)
+# Read the version from the project_version file rather than from config.toml
+project_version = get_version_from_file()
+reload_debug = False
+
+imagemagick_path = app.get("imagemagick_path", "")
+if imagemagick_path and os.path.isfile(imagemagick_path):
+    os.environ["IMAGEMAGICK_BINARY"] = imagemagick_path
+
+ffmpeg_path = app.get("ffmpeg_path", "")
+if ffmpeg_path and os.path.isfile(ffmpeg_path):
+    os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_path
+
+logger.info(f"{project_name} v{project_version}")
diff --git a/app/controllers/base.py b/app/controllers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..122e341e07529b720f6a422d57f1f7a4d80a6638
--- /dev/null
+++ b/app/controllers/base.py
@@ -0,0 +1,31 @@
+from uuid import uuid4
+
+from fastapi import Request
+
+from app.config import config
+from app.models.exception import HttpException
+
+
+def get_task_id(request: Request):
+    """Return the caller-supplied x-task-id header, or a fresh UUID4 as a string."""
+    supplied = request.headers.get("x-task-id")
+    # An absent or empty header falls back to a newly generated id.
+    return str(supplied or uuid4())
+
+
+def get_api_key(request: Request):
+    # Value of the x-api-key request header, or None when the header is absent.
+    return request.headers.get("x-api-key")
+
+
+def verify_token(request: Request):
+    """Raise HttpException(401) unless the x-api-key header equals the configured app api_key."""
+    if get_api_key(request) == config.app.get("api_key", ""):
+        return
+    request_url = request.url
+    user_agent = request.headers.get("user-agent")
+    raise HttpException(
+        task_id=get_task_id(request),
+        status_code=401,
+        message=f"invalid token: {request_url}, {user_agent}",
+    )
diff --git a/app/controllers/manager/base_manager.py b/app/controllers/manager/base_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..462589e14fc642c1cc10dea29096e6e730e64f4d
--- /dev/null
+++ b/app/controllers/manager/base_manager.py
@@ -0,0 +1,64 @@
+import threading
+from typing import Callable, Any, Dict
+
+
+class TaskManager:
+    """Run callables on worker threads, at most ``max_concurrent_tasks`` at once; extras wait in a queue."""
+    def __init__(self, max_concurrent_tasks: int):
+        self.max_concurrent_tasks = max_concurrent_tasks
+        self.current_tasks = 0  # slots held by started tasks
+        self.lock = threading.Lock()  # guards current_tasks and queue access
+        self.queue = self.create_queue()
+
+    def create_queue(self):
+        raise NotImplementedError()
+
+    def add_task(self, func: Callable, *args: Any, **kwargs: Any):
+        """Start ``func`` immediately if a slot is free, otherwise enqueue it."""
+        with self.lock:
+            if self.current_tasks < self.max_concurrent_tasks:
+                print(f"add task: {func.__name__}, current_tasks: {self.current_tasks}")
+                self.execute_task(func, *args, **kwargs)
+            else:
+                print(f"enqueue task: {func.__name__}, current_tasks: {self.current_tasks}")
+                self.enqueue({"func": func, "args": args, "kwargs": kwargs})
+
+    def execute_task(self, func: Callable, *args: Any, **kwargs: Any):
+        """Claim a slot and start the worker thread; add_task/check_queue call this holding ``self.lock``."""
+        # Claim the slot here rather than in the worker so concurrent add_task calls cannot over-admit.
+        self.current_tasks += 1
+        try:
+            threading.Thread(target=self.run_task, args=(func, *args), kwargs=kwargs).start()
+        except BaseException:
+            self.current_tasks -= 1  # the thread never started, so give the slot back
+            raise
+
+    def run_task(self, func: Callable, *args: Any, **kwargs: Any):
+        """Worker body: call ``func`` and release its slot even if it raises."""
+        try:
+            func(*args, **kwargs)
+        finally:
+            self.task_done()
+
+    def check_queue(self):
+        """Start the next queued task when a slot is free and the queue is not empty."""
+        with self.lock:
+            if self.current_tasks < self.max_concurrent_tasks and not self.is_queue_empty():
+                task_info = self.dequeue()
+                if task_info:  # dequeue() may yield None (RedisTaskManager does on an empty list)
+                    self.execute_task(task_info["func"], *task_info.get("args", ()), **task_info.get("kwargs", {}))
+
+    def task_done(self):
+        """Release one slot, then try to start a queued task."""
+        with self.lock:
+            self.current_tasks -= 1
+        self.check_queue()
+
+    def enqueue(self, task: Dict):
+        raise NotImplementedError()
+
+    def dequeue(self):
+        raise NotImplementedError()
+
+    def is_queue_empty(self):
+        raise NotImplementedError()
diff --git a/app/controllers/manager/memory_manager.py b/app/controllers/manager/memory_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf7321f56e3e2dde3177d304f3860185b1ab8baa
--- /dev/null
+++ b/app/controllers/manager/memory_manager.py
@@ -0,0 +1,18 @@
+from queue import Queue
+from typing import Dict
+
+from app.controllers.manager.base_manager import TaskManager
+
+
+class InMemoryTaskManager(TaskManager):  # TaskManager backed by an in-process queue.Queue
+    def create_queue(self):
+        return Queue()  # no maxsize given, so the queue is unbounded
+
+    def enqueue(self, task: Dict):
+        self.queue.put(task)
+
+    def dequeue(self):
+        return self.queue.get()  # Queue.get() blocks if the queue is empty
+
+    def is_queue_empty(self):
+        return self.queue.empty()
diff --git a/app/controllers/manager/redis_manager.py b/app/controllers/manager/redis_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..cad1912aefb95fa6a6ad16b3617dac08729c9cd1
--- /dev/null
+++ b/app/controllers/manager/redis_manager.py
@@ -0,0 +1,56 @@
+import json
+from typing import Dict
+
+import redis
+
+from app.controllers.manager.base_manager import TaskManager
+from app.models.schema import VideoParams
+from app.services import task as tm
+
+FUNC_MAP = {  # function name stored in the queued JSON -> callable to run
+    "start": tm.start,
+    # 'start_test': tm.start_test
+}
+
+
+class RedisTaskManager(TaskManager):
+    """TaskManager whose waiting tasks are stored as JSON in a Redis list.
+
+    Functions are stored by name, so only callables registered in FUNC_MAP
+    can be restored by dequeue().
+    """
+
+    def __init__(self, max_concurrent_tasks: int, redis_url: str):
+        self.redis_client = redis.Redis.from_url(redis_url)
+        super().__init__(max_concurrent_tasks)
+
+    def create_queue(self):
+        # The "queue" is just the key of the Redis list.
+        return "task_queue"
+
+    def enqueue(self, task: Dict):
+        """Serialize ``task`` to JSON and append it to the Redis list."""
+        # Copy kwargs as well: a shallow task.copy() shares the kwargs dict, so
+        # converting params in place would also rewrite the caller's task.
+        kwargs = dict(task["kwargs"])
+        if isinstance(kwargs.get("params"), VideoParams):
+            kwargs["params"] = kwargs["params"].dict()
+        # Store the function by name; dequeue() maps it back through FUNC_MAP.
+        payload = {**task, "kwargs": kwargs, "func": task["func"].__name__}
+        self.redis_client.rpush(self.queue, json.dumps(payload))
+
+    def dequeue(self):
+        """Pop the oldest task and restore its callable and VideoParams; None when the list is empty."""
+        task_json = self.redis_client.lpop(self.queue)
+        if not task_json:
+            return None
+        task_info = json.loads(task_json)
+        task_info["func"] = FUNC_MAP[task_info["func"]]  # name back to function object
+        params = task_info["kwargs"].get("params")
+        if isinstance(params, dict):
+            task_info["kwargs"]["params"] = VideoParams(**params)
+        return task_info
+
+    def is_queue_empty(self):
+        """Return True when the Redis list holds no tasks."""
+        return self.redis_client.llen(self.queue) == 0
diff --git a/app/controllers/ping.py b/app/controllers/ping.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3eeff01adfc9442e8668655d3df5bbf66fb84e3
--- /dev/null
+++ b/app/controllers/ping.py
@@ -0,0 +1,14 @@
+from fastapi import APIRouter
+from fastapi import Request
+
+router = APIRouter()
+
+
+@router.get(
+    "/ping",
+    tags=["Health Check"],
+    description="检查服务可用性",
+    response_description="pong",
+)
+def ping(request: Request) -> str:  # health check: always answers "pong"
+    return "pong"
diff --git a/app/controllers/v1/base.py b/app/controllers/v1/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..51794dff416898c0d0ac07e7ef2052231b02a98c
--- /dev/null
+++ b/app/controllers/v1/base.py
@@ -0,0 +1,11 @@
+from fastapi import APIRouter, Depends
+
+
+def new_router(dependencies=None):
+    """Create an APIRouter tagged "V1" and prefixed with ``/api/v1``.
+
+    Authentication dependencies passed in are applied to all routes.
+    """
+    return APIRouter(
+        prefix="/api/v1", tags=["V1"], dependencies=dependencies
+    )
diff --git a/app/controllers/v1/llm.py b/app/controllers/v1/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5da6aeb36c1484e09e0d3a1bbb2a49e7ac85789
--- /dev/null
+++ b/app/controllers/v1/llm.py
@@ -0,0 +1,93 @@
+from fastapi import Request, File, UploadFile
+import os
+from app.controllers.v1.base import new_router
+from app.models.schema import (
+    VideoScriptResponse,
+    VideoScriptRequest,
+    VideoTermsResponse,
+    VideoTermsRequest,
+    VideoTranscriptionRequest,
+    VideoTranscriptionResponse,
+)
+from app.services import llm
+from app.utils import utils
+from app.config import config
+
+# Authentication dependency
+# router = new_router(dependencies=[Depends(base.verify_token)])
+router = new_router()
+
+# Define the upload directory
+UPLOAD_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "uploads")
+
+@router.post(
+    "/scripts",
+    response_model=VideoScriptResponse,
+    summary="Create a script for the video",
+)
+def generate_video_script(request: Request, body: VideoScriptRequest):
+    # Ask the LLM service for a script and wrap it in the standard response.
+    script_args = {
+        "video_subject": body.video_subject,
+        "language": body.video_language,
+        "paragraph_number": body.paragraph_number,
+    }
+    return utils.get_response(200, {"video_script": llm.generate_script(**script_args)})
+
+
+@router.post(
+    "/terms",
+    response_model=VideoTermsResponse,
+    summary="Generate video terms based on the video script",
+)
+def generate_video_terms(request: Request, body: VideoTermsRequest):
+    # Ask the LLM service for video terms and wrap them in the standard response.
+    term_args = {
+        "video_subject": body.video_subject,
+        "video_script": body.video_script,
+        "amount": body.amount,
+    }
+    return utils.get_response(200, {"video_terms": llm.generate_terms(**term_args)})
+
+
+@router.post(
+    "/transcription",
+    response_model=VideoTranscriptionResponse,
+    summary="Transcribe video content using Gemini"
+)
+async def transcribe_video(
+    request: Request,
+    video_name: str,
+    language: str = "zh-CN",
+    video_file: UploadFile = File(...)
+):
+    """
+    Transcribe video content using Gemini (timestamps, scene descriptions, speech).
+
+    Args:
+        video_name: Name of the video
+        language: Language code, defaults to zh-CN
+        video_file: The uploaded video file
+    """
+    os.makedirs(UPLOAD_DIR, exist_ok=True)
+    # The filename is client-controlled: keep only its last path component so
+    # values such as "../../x" or "/etc/x" cannot be written outside UPLOAD_DIR.
+    safe_name = os.path.basename((video_file.filename or "").replace("\\", "/"))
+    if safe_name in ("", ".", ".."):
+        safe_name = "upload"
+    video_path = os.path.join(UPLOAD_DIR, safe_name)
+
+    try:
+        # Written inside the try so a partially written file is removed too.
+        with open(video_path, "wb") as buffer:
+            buffer.write(await video_file.read())
+        transcription = llm.gemini_video_transcription(
+            video_name=video_name,
+            video_path=video_path,
+            language=language,
+            llm_provider_video=config.app.get("video_llm_provider", "gemini")
+        )
+        return utils.get_response(200, {"transcription": transcription})
+    finally:
+        if os.path.exists(video_path):
+            os.remove(video_path)
diff --git a/app/controllers/v1/video.py b/app/controllers/v1/video.py
new file mode 100644
index 0000000000000000000000000000000000000000..336084f4c85ab5997b3869c23afe5d0ea87e66f1
--- /dev/null
+++ b/app/controllers/v1/video.py
@@ -0,0 +1,271 @@
+import glob
+import os
+import pathlib
+import shutil
+from typing import Union
+
+from fastapi import BackgroundTasks, Depends, Path, Request, UploadFile
+from fastapi.params import File
+from fastapi.responses import FileResponse, StreamingResponse
+from loguru import logger
+
+from app.config import config
+from app.controllers import base
+from app.controllers.manager.memory_manager import InMemoryTaskManager
+from app.controllers.manager.redis_manager import RedisTaskManager
+from app.controllers.v1.base import new_router
+from app.models.exception import HttpException
+from app.models.schema import (
+    AudioRequest,
+    BgmRetrieveResponse,
+    BgmUploadResponse,
+    SubtitleRequest,
+    TaskDeletionResponse,
+    TaskQueryRequest,
+    TaskQueryResponse,
+    TaskResponse,
+    TaskVideoRequest,
+)
+from app.services import state as sm
+from app.services import task as tm
+from app.utils import utils
+
+# Authentication dependency
+# router = new_router(dependencies=[Depends(base.verify_token)])
+router = new_router()
+
+_enable_redis = config.app.get("enable_redis", False)
+_redis_host = config.app.get("redis_host", "localhost")
+_redis_port = config.app.get("redis_port", 6379)
+_redis_db = config.app.get("redis_db", 0)
+_redis_password = config.app.get("redis_password", None)
+_max_concurrent_tasks = config.app.get("max_concurrent_tasks", 5)
+
+# No password -> no credentials; formatting None would send the password "None".
+_redis_auth = f":{_redis_password}@" if _redis_password else ""
+redis_url = f"redis://{_redis_auth}{_redis_host}:{_redis_port}/{_redis_db}"
+# Choose the task manager according to the configuration
+if _enable_redis:
+    task_manager = RedisTaskManager(_max_concurrent_tasks, redis_url)
+else:
+    task_manager = InMemoryTaskManager(max_concurrent_tasks=_max_concurrent_tasks)
+
+
+@router.post("/videos", response_model=TaskResponse, summary="Generate a short video")
+def create_video(background_tasks: BackgroundTasks, request: Request, body: TaskVideoRequest):
+    # Thin route wrapper; ``background_tasks`` is accepted but not used here.
+    stage = "video"
+    return create_task(request=request, body=body, stop_at=stage)
+
+
+@router.post("/subtitle", response_model=TaskResponse, summary="Generate subtitle only")
+def create_subtitle(background_tasks: BackgroundTasks, request: Request, body: SubtitleRequest):
+    # Thin route wrapper; ``background_tasks`` is accepted but not used here.
+    stage = "subtitle"
+    return create_task(request=request, body=body, stop_at=stage)
+
+
+@router.post("/audio", response_model=TaskResponse, summary="Generate audio only")
+def create_audio(background_tasks: BackgroundTasks, request: Request, body: AudioRequest):
+    # Thin route wrapper; ``background_tasks`` is accepted but not used here.
+    stage = "audio"
+    return create_task(request=request, body=body, stop_at=stage)
+
+
+def create_task(
+    request: Request,
+    body: Union[TaskVideoRequest, SubtitleRequest, AudioRequest],
+    stop_at: str,
+):
+    """Register a new task and hand it to the task manager.
+
+    Returns the standard 200 response wrapping the task id, request id and
+    the dumped request params; a ValueError becomes a 400 HttpException.
+    """
+    task_id, request_id = utils.get_uuid(), base.get_task_id(request)
+    try:
+        task = dict(task_id=task_id, request_id=request_id, params=body.model_dump())
+        sm.state.update_task(task_id)
+        task_manager.add_task(tm.start, task_id=task_id, params=body, stop_at=stop_at)
+        logger.success(f"Task created: {utils.to_json(task)}")
+        return utils.get_response(200, task)
+    except ValueError as e:
+        raise HttpException(
+            task_id=task_id, status_code=400, message=f"{request_id}: {str(e)}"
+        )
+
+
+@router.get(
+    "/tasks/{task_id}", response_model=TaskQueryResponse, summary="Query task status"
+)
+def get_task(
+    request: Request,
+    task_id: str = Path(..., description="Task ID"),
+    query: TaskQueryRequest = Depends(),
+):
+    # Return the stored state of a task, with the file paths under "videos"
+    # and "combined_videos" rewritten into URLs; unknown ids raise a 404.
+    # ``query`` is declared for the route signature but is not read here.
+
+    # URL base: the configured "endpoint", else this request's base URL.
+    endpoint = config.app.get("endpoint", "") or str(request.base_url)
+    endpoint = endpoint.rstrip("/")
+
+    request_id = base.get_task_id(request)
+    task = sm.state.get_task(task_id)
+    if not task:
+        raise HttpException(
+            task_id=task_id, status_code=404, message=f"{request_id}: task not found"
+        )
+
+    task_dir = utils.task_dir()
+
+    def file_to_uri(file):
+        # Entries that already start with the endpoint are returned as-is.
+        if file.startswith(endpoint):
+            return file
+        # Convert the ``file`` argument itself rather than relying on a
+        # loop variable of the enclosing function being in scope.
+        _uri_path = file.replace(task_dir, "tasks").replace("\\", "/")
+        return f"{endpoint}/{_uri_path}"
+
+    # The task dict obtained from the state store is updated in place.
+    if "videos" in task:
+        task["videos"] = [file_to_uri(path) for path in task["videos"]]
+    if "combined_videos" in task:
+        task["combined_videos"] = [
+            file_to_uri(path) for path in task["combined_videos"]
+        ]
+
+    return utils.get_response(200, task)
+
+
+@router.delete(
+    "/tasks/{task_id}",
+    response_model=TaskDeletionResponse,
+    summary="Delete a generated short video task",
+)
+def delete_video(request: Request, task_id: str = Path(..., description="Task ID")):
+    # Remove a task's directory and its state entry; unknown ids raise a 404.
+    request_id = base.get_task_id(request)
+    task = sm.state.get_task(task_id)
+    if not task:
+        raise HttpException(
+            task_id=task_id, status_code=404, message=f"{request_id}: task not found"
+        )
+
+    # Delete the on-disk directory first, then drop the task from the state.
+    current_task_dir = os.path.join(utils.task_dir(), task_id)
+    if os.path.exists(current_task_dir):
+        shutil.rmtree(current_task_dir)
+    sm.state.delete_task(task_id)
+    logger.success(f"video deleted: {utils.to_json(task)}")
+    return utils.get_response(200)
+
+
+# @router.get(
+#     "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
+# )
+# def get_bgm_list(request: Request):
+#     suffix = "*.mp3"
+#     song_dir = utils.song_dir()
+#     files = glob.glob(os.path.join(song_dir, suffix))
+#     bgm_list = []
+#     for file in files:
+#         bgm_list.append(
+#             {
+#                 "name": os.path.basename(file),
+#                 "size": os.path.getsize(file),
+#                 "file": file,
+#             }
+#         )
+#     response = {"files": bgm_list}
+#     return utils.get_response(200, response)
+#
+
+# @router.post(
+#     "/musics",
+#     response_model=BgmUploadResponse,
+#     summary="Upload the BGM file to the songs directory",
+# )
+# def upload_bgm_file(request: Request, file: UploadFile = File(...)):
+#     request_id = base.get_task_id(request)
+#     # check file ext
+#     if file.filename.endswith("mp3"):
+#         song_dir = utils.song_dir()
+#         save_path = os.path.join(song_dir, file.filename)
+#         # save file
+#         with open(save_path, "wb+") as buffer:
+#             # If the file already exists, it will be overwritten
+#             file.file.seek(0)
+#             buffer.write(file.file.read())
+#         response = {"file": save_path}
+#         return utils.get_response(200, response)
+#
+#     raise HttpException(
+#         "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
+#     )
+#
+#
+# @router.get("/stream/{file_path:path}")
+# async def stream_video(request: Request, file_path: str):
+#     tasks_dir = utils.task_dir()
+#     video_path = os.path.join(tasks_dir, file_path)
+#     range_header = request.headers.get("Range")
+#     video_size = os.path.getsize(video_path)
+#     start, end = 0, video_size - 1
+#
+#     length = video_size
+#     if range_header:
+#         range_ = range_header.split("bytes=")[1]
+#         start, end = [int(part) if part else None for part in range_.split("-")]
+#         if start is None:
+#             start = video_size - end
+#             end = video_size - 1
+#         if end is None:
+#             end = video_size - 1
+#         length = end - start + 1
+#
+#     def file_iterator(file_path, offset=0, bytes_to_read=None):
+#         with open(file_path, "rb") as f:
+#             f.seek(offset, os.SEEK_SET)
+#             remaining = bytes_to_read or video_size
+#             while remaining > 0:
+#                 bytes_to_read = min(4096, remaining)
+#                 data = f.read(bytes_to_read)
+#                 if not data:
+#                     break
+#                 remaining -= len(data)
+#                 yield data
+#
+#     response = StreamingResponse(
+#         file_iterator(video_path, start, length), media_type="video/mp4"
+#     )
+#     response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
+#     response.headers["Accept-Ranges"] = "bytes"
+#     response.headers["Content-Length"] = str(length)
+#     response.status_code = 206  # Partial Content
+#
+#     return response
+#
+#
+# @router.get("/download/{file_path:path}")
+# async def download_video(_: Request, file_path: str):
+#     """
+#     download video
+#     :param _: Request request
+#     :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
+#     :return: video file
+#     """
+#     tasks_dir = utils.task_dir()
+#     video_path = os.path.join(tasks_dir, file_path)
+#     file_path = pathlib.Path(video_path)
+#     filename = file_path.stem
+#     extension = file_path.suffix
+#     headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
+#     return FileResponse(
+#         path=video_path,
+#         headers=headers,
+#         filename=f"{filename}{extension}",
+#         media_type=f"video/{extension[1:]}",
+#     )
diff --git a/app/controllers/v2/base.py b/app/controllers/v2/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4612983f17557473d62191d13b0086017cd4bf00
--- /dev/null
+++ b/app/controllers/v2/base.py
@@ -0,0 +1,11 @@
+from fastapi import APIRouter, Depends
+
+
+def v2_router(dependencies=None):
+    """Create an APIRouter tagged "V2" and prefixed with ``/api/v2``.
+
+    Authentication dependencies passed in are applied to all routes.
+    """
+    return APIRouter(
+        prefix="/api/v2", tags=["V2"], dependencies=dependencies
+    )
diff --git a/app/controllers/v2/script.py b/app/controllers/v2/script.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3501eb89784f4c96a0e1bb000aaf5d03e8aa949
--- /dev/null
+++ b/app/controllers/v2/script.py
@@ -0,0 +1,170 @@
+from fastapi import APIRouter, BackgroundTasks
+from loguru import logger
+import os
+
+from app.models.schema_v2 import (
+    GenerateScriptRequest, 
+    GenerateScriptResponse,
+    CropVideoRequest,
+    CropVideoResponse,
+    DownloadVideoRequest,
+    DownloadVideoResponse,
+    StartSubclipRequest,
+    StartSubclipResponse
+)
+from app.models.schema import VideoClipParams
+from app.services.script_service import ScriptGenerator
+from app.services.video_service import VideoService
+from app.utils import utils
+from app.controllers.v2.base import v2_router
+from app.models.schema import VideoClipParams
+from app.services.youtube_service import YoutubeService
+from app.services import task as task_service
+
+router = v2_router()
+
+
+@router.post(
+    "/scripts/generate",
+    response_model=GenerateScriptResponse,
+    summary="同步请求；生成视频脚本 (V2)"
+)
+async def generate_script(
+    request: GenerateScriptRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    V2 API for generating a video script.
+
+    Forwards the request fields to ScriptGenerator.generate_script and
+    returns the script together with a newly generated task id. Any
+    failure is logged with its traceback and re-raised unchanged.
+    """
+    task_id = utils.get_uuid()
+
+    try:
+        script = await ScriptGenerator().generate_script(
+            video_path=request.video_path,
+            video_theme=request.video_theme,
+            custom_prompt=request.custom_prompt,
+            skip_seconds=request.skip_seconds,
+            threshold=request.threshold,
+            vision_batch_size=request.vision_batch_size,
+            vision_llm_provider=request.vision_llm_provider
+        )
+    except Exception as e:
+        logger.exception(f"Generate script failed: {str(e)}")
+        raise
+
+    # ``background_tasks`` is part of the signature but is not used.
+    return {"task_id": task_id, "script": script}
+
+
+@router.post(
+    "/scripts/crop",
+    response_model=CropVideoResponse,
+    summary="同步请求；裁剪视频 (V2)"
+)
+async def crop_video(
+    request: CropVideoRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    V2 API for cropping a video according to its script.
+
+    Returns the task id and sub-clip videos from VideoService.crop_video.
+    """
+    try:
+        # Call the video cropping service
+        cropper = VideoService()
+        task_id, subclip_videos = await cropper.crop_video(
+            video_path=request.video_origin_path,
+            video_script=request.video_script
+        )
+        logger.debug(f"裁剪视频成功，视频片段路径: {subclip_videos}")
+        logger.debug(type(subclip_videos))
+    except Exception as e:
+        logger.exception(f"Crop video failed: {str(e)}")
+        raise
+
+    # Success path: hand both values back to the client.
+    return {"task_id": task_id, "subclip_videos": subclip_videos}
+
+
+@router.post(
+    "/youtube/download",
+    response_model=DownloadVideoResponse,
+    summary="同步请求；下载YouTube视频 (V2)"
+)
+async def download_youtube_video(
+    request: DownloadVideoRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    Download a YouTube video at the specified resolution.
+    """
+    try:
+        downloader = YoutubeService()
+        task_id, output_path, filename = await downloader.download_video(
+            url=request.url,
+            resolution=request.resolution,
+            output_format=request.output_format,
+            rename=request.rename
+        )
+    except Exception as e:
+        logger.exception(f"Download YouTube video failed: {str(e)}")
+        raise
+
+    # Echo the requested resolution/format alongside the download result.
+    return {
+        "task_id": task_id,
+        "output_path": output_path,
+        "resolution": request.resolution,
+        "format": request.output_format,
+        "filename": filename
+    }
+
+
+@router.post(
+    "/scripts/start-subclip",
+    response_model=StartSubclipResponse,
+    summary="异步请求；开始视频剪辑任务 (V2)"
+)
+async def start_subclip(
+    request: VideoClipParams,
+    task_id: str,
+    subclip_videos: dict,
+    background_tasks: BackgroundTasks
+):
+    """
+    V2 API for starting a video clipping task.
+
+    The clipping itself is queued as a background task, so the response
+    only carries the task id and the initial "PROCESSING" state.
+    """
+    try:
+        # Build the parameter object
+        params = VideoClipParams(
+            video_origin_path=request.video_origin_path,
+            video_clip_json_path=request.video_clip_json_path,
+            voice_name=request.voice_name,
+            voice_rate=request.voice_rate,
+            voice_pitch=request.voice_pitch,
+            subtitle_enabled=request.subtitle_enabled,
+            video_aspect=request.video_aspect,
+            n_threads=request.n_threads
+        )
+
+        # Run the video clipping in a background task
+        background_tasks.add_task(
+            task_service.start_subclip,
+            task_id=task_id,
+            params=params,
+            subclip_path_videos=subclip_videos
+        )
+    except Exception as e:
+        logger.exception(f"Start subclip task failed: {str(e)}")
+        raise
+
+    # Initial state
+    return {"task_id": task_id, "state": "PROCESSING"}
diff --git a/app/models/__init__.py b/app/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/app/models/const.py b/app/models/const.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7540ef34c134a3687d5d912b69e895b28c772d4
--- /dev/null
+++ b/app/models/const.py
@@ -0,0 +1,25 @@
+PUNCTUATIONS = [  # punctuation strings, one entry per mark
+    "?",
+    ",",
+    ".",
+    "、",  # NOTE(review): "、" appears to be listed a second time further down
+    ";",
+    ":",
+    "!",
+    "…",
+    "？",
+    "，",
+    "。",
+    "、",
+    "；",
+    "：",
+    "！",
+    "...",
+]
+
+TASK_STATE_FAILED = -1  # integer task-state codes
+TASK_STATE_COMPLETE = 1
+TASK_STATE_PROCESSING = 4
+
+FILE_TYPE_VIDEOS = ["mp4", "mov", "mkv", "webm"]  # extensions, no leading dot
+FILE_TYPE_IMAGES = ["jpg", "jpeg", "png", "bmp"]
diff --git a/app/models/exception.py b/app/models/exception.py
new file mode 100644
index 0000000000000000000000000000000000000000..b186cae90c05e75134e242aa091c57ab2bade963
--- /dev/null
+++ b/app/models/exception.py
@@ -0,0 +1,28 @@
+import traceback
+from typing import Any
+
+from loguru import logger
+
+
+class HttpException(Exception):
+    """Error carrying an HTTP status code, message and data; logs itself."""
+
+    def __init__(
+        self, task_id: str, status_code: int, message: str = "", data: Any = None
+    ):
+        # Forward the message so str(exc) / exc.args are populated.
+        super().__init__(message)
+        self.task_id = task_id
+        self.message = message
+        self.status_code = status_code
+        self.data = data
+        # Logged at creation (400 -> warning, else error), with any active traceback.
+        msg = f"HttpException: {status_code}, {task_id}, {message}"
+        tb_str = traceback.format_exc().strip()
+        if tb_str and tb_str != "NoneType: None":
+            msg = f"{msg}\n{tb_str}"
+        (logger.warning if status_code == 400 else logger.error)(msg)
+
+
+class FileNotFoundException(Exception):  # plain Exception subclass, no extra state
+    pass
diff --git a/app/models/schema.py b/app/models/schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddf0ad11c654ca5511e0969b1abf9ddc2c7e6ce4
--- /dev/null
+++ b/app/models/schema.py
@@ -0,0 +1,391 @@
+import warnings
+from enum import Enum
+from typing import Any, List, Optional
+
+import pydantic
+from pydantic import BaseModel, Field
+
+# Ignore a specific Pydantic warning
+warnings.filterwarnings(
+    "ignore",
+    category=UserWarning,
+    message="Field name.*shadows an attribute in parent.*",
+)
+
+
+class VideoConcatMode(str, Enum):  # string-valued enum of concat modes
+    random = "random"
+    sequential = "sequential"
+
+
+class VideoAspect(str, Enum):
+    """Supported aspect ratios; each value is a "width:height" string."""
+    landscape = "16:9"
+    landscape_2 = "4:3"
+    portrait = "9:16"
+    portrait_2 = "3:4"
+    square = "1:1"
+
+    def to_resolution(self):
+        """Return the (width, height) in pixels for this aspect ratio."""
+        sizes = {
+            "16:9": (1920, 1080), "4:3": (1440, 1080),
+            "9:16": (1080, 1920), "3:4": (1080, 1440), "1:1": (1080, 1080),
+        }
+        return sizes.get(self.value, (1080, 1920))
+
+
+class _Config:  # passed as ``config=`` to the pydantic dataclass MaterialInfo
+    arbitrary_types_allowed = True
+
+
+@pydantic.dataclasses.dataclass(config=_Config)
+class MaterialInfo:  # one material entry (element type of VideoParams.video_materials)
+    provider: str = "pexels"
+    url: str = ""
+    duration: int = 0  # NOTE(review): unit not visible here, presumably seconds
+
+
+# VoiceNames = [
+#     # zh-CN
+#     "female-zh-CN-XiaoxiaoNeural",
+#     "female-zh-CN-XiaoyiNeural",
+#     "female-zh-CN-liaoning-XiaobeiNeural",
+#     "female-zh-CN-shaanxi-XiaoniNeural",
+#
+#     "male-zh-CN-YunjianNeural",
+#     "male-zh-CN-YunxiNeural",
+#     "male-zh-CN-YunxiaNeural",
+#     "male-zh-CN-YunyangNeural",
+#
+#     # "female-zh-HK-HiuGaaiNeural",
+#     # "female-zh-HK-HiuMaanNeural",
+#     # "male-zh-HK-WanLungNeural",
+#     #
+#     # "female-zh-TW-HsiaoChenNeural",
+#     # "female-zh-TW-HsiaoYuNeural",
+#     # "male-zh-TW-YunJheNeural",
+#
+#     # en-US
+#     "female-en-US-AnaNeural",
+#     "female-en-US-AriaNeural",
+#     "female-en-US-AvaNeural",
+#     "female-en-US-EmmaNeural",
+#     "female-en-US-JennyNeural",
+#     "female-en-US-MichelleNeural",
+#
+#     "male-en-US-AndrewNeural",
+#     "male-en-US-BrianNeural",
+#     "male-en-US-ChristopherNeural",
+#     "male-en-US-EricNeural",
+#     "male-en-US-GuyNeural",
+#     "male-en-US-RogerNeural",
+#     "male-en-US-SteffanNeural",
+# ]
+
+
+class VideoParams(BaseModel):
+    """
+    {
+      "video_subject": "",
+      "video_aspect": "Landscape 16:9 (Xigua Video)",
+      "voice_name": "Female - Xiaoxiao",
+      "bgm_name": "random",
+      "font_name": "STHeitiMedium Heiti-Medium",
+      "text_color": "#FFFFFF",
+      "font_size": 60,
+      "stroke_color": "#000000",
+      "stroke_width": 1.5
+    }
+    """
+
+    video_subject: str
+    video_script: str = ""  # script used to generate the video
+    video_terms: Optional[str | list] = None  # keywords used to generate the video
+    video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
+    video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
+    video_clip_duration: Optional[int] = 5
+    video_count: Optional[int] = 1
+
+    video_source: Optional[str] = "pexels"
+    video_materials: Optional[List[MaterialInfo]] = None  # materials used to generate the video
+
+    video_language: Optional[str] = ""  # auto detect
+
+    voice_name: Optional[str] = ""
+    voice_volume: Optional[float] = 1.0
+    voice_rate: Optional[float] = 1.0
+    bgm_type: Optional[str] = "random"
+    bgm_file: Optional[str] = ""
+    bgm_volume: Optional[float] = 0.2
+
+    subtitle_enabled: Optional[bool] = True
+    subtitle_position: Optional[str] = "bottom"  # top, bottom, center
+    custom_position: float = 70.0
+    font_name: Optional[str] = "STHeitiMedium.ttc"
+    text_fore_color: Optional[str] = "#FFFFFF"
+    text_background_color: Optional[str] = "transparent"
+
+    font_size: int = 60
+    stroke_color: Optional[str] = "#000000"
+    stroke_width: float = 1.5
+    n_threads: Optional[int] = 2
+    paragraph_number: Optional[int] = 1
+
+
+class SubtitleRequest(BaseModel):  # body of the POST /subtitle route
+    video_script: str
+    video_language: Optional[str] = ""
+    voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
+    voice_volume: Optional[float] = 1.0
+    voice_rate: Optional[float] = 1.2
+    bgm_type: Optional[str] = "random"
+    bgm_file: Optional[str] = ""
+    bgm_volume: Optional[float] = 0.2
+    subtitle_position: Optional[str] = "bottom"
+    font_name: Optional[str] = "STHeitiMedium.ttc"
+    text_fore_color: Optional[str] = "#FFFFFF"
+    text_background_color: Optional[str] = "transparent"
+    font_size: int = 60
+    stroke_color: Optional[str] = "#000000"
+    stroke_width: float = 1.5
+    video_source: Optional[str] = "local"
+    subtitle_enabled: Optional[str] = "true"  # NOTE(review): a str here, but a bool in VideoParams
+
+
+class AudioRequest(BaseModel):  # body of the POST /audio route
+    video_script: str
+    video_language: Optional[str] = ""
+    voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
+    voice_volume: Optional[float] = 1.0
+    voice_rate: Optional[float] = 1.2
+    bgm_type: Optional[str] = "random"
+    bgm_file: Optional[str] = ""
+    bgm_volume: Optional[float] = 0.2
+    video_source: Optional[str] = "local"
+
+
+class VideoScriptParams:  # plain class (not a BaseModel); mixed into VideoScriptRequest
+    """
+    {
+      "video_subject": "A sea of flowers in spring",
+      "video_language": "",
+      "paragraph_number": 1
+    }
+    """
+
+    video_subject: Optional[str] = "春天的花海"
+    video_language: Optional[str] = ""
+    paragraph_number: Optional[int] = 1
+
+
+class VideoTermsParams:  # plain class (not a BaseModel); mixed into VideoTermsRequest
+    """
+    {
+      "video_subject": "",
+      "video_script": "",
+      "amount": 5
+    }
+    """
+
+    video_subject: Optional[str] = "春天的花海"
+    video_script: Optional[str] = (
+        "春天的花海，如诗如画般展现在眼前。万物复苏的季节里，大地披上了一袭绚丽多彩的盛装。金黄的迎春、粉嫩的樱花、洁白的梨花、艳丽的郁金香……"
+    )
+    amount: Optional[int] = 5
+
+
+class BaseResponse(BaseModel):  # common envelope: status, message, data
+    status: int = 200
+    message: Optional[str] = "success"
+    data: Any = None
+
+
+class TaskVideoRequest(VideoParams, BaseModel):  # body of POST /videos; adds no fields
+    pass
+
+
+class TaskQueryRequest(BaseModel):  # declares no fields; used via Depends() in get_task
+    pass
+
+
+class VideoScriptRequest(VideoScriptParams, BaseModel):  # body of POST /scripts
+    pass
+
+
+class VideoTermsRequest(VideoTermsParams, BaseModel):  # body of POST /terms
+    pass
+
+
+######################################################################################################
+######################################################################################################
+######################################################################################################
+######################################################################################################
+class TaskResponse(BaseResponse):  # envelope whose data carries the task id
+    class TaskResponseData(BaseModel):
+        task_id: str
+
+    data: TaskResponseData
+
+    class Config:  # example payload attached to the generated JSON schema
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {"task_id": "6c85c8cc-a77a-42b9-bc30-947815aa0558"},
+            },
+        }
+
+
+class TaskQueryResponse(BaseResponse):  # response model of GET /tasks/{task_id}
+    class Config:  # example payload attached to the generated JSON schema
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {
+                    "state": 1,
+                    "progress": 100,
+                    "videos": [
+                        "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/final-1.mp4"
+                    ],
+                    "combined_videos": [
+                        "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4"
+                    ],
+                },
+            },
+        }
+
+
+class TaskDeletionResponse(BaseResponse):  # response model of DELETE /tasks/{task_id}
+    class Config:  # NOTE(review): example equals TaskQueryResponse's; delete_video passes no data
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {
+                    "state": 1,
+                    "progress": 100,
+                    "videos": [
+                        "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/final-1.mp4"
+                    ],
+                    "combined_videos": [
+                        "http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4"
+                    ],
+                },
+            },
+        }
+
+
+class VideoScriptResponse(BaseResponse):  # response model of POST /scripts
+    class Config:  # example payload attached to the generated JSON schema
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {
+                    "video_script": "春天的花海，是大自然的一幅美丽画卷。在这个季节里，大地复苏，万物生长，花朵争相绽放，形成了一片五彩斑斓的花海..."
+                },
+            },
+        }
+
+
+class VideoTermsResponse(BaseResponse):  # response model of POST /terms
+    class Config:  # example payload attached to the generated JSON schema
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {"video_terms": ["sky", "tree"]},
+            },
+        }
+
+
+class BgmRetrieveResponse(BaseResponse):  # only referenced by the commented-out GET /musics route
+    class Config:  # example payload attached to the generated JSON schema
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {
+                    "files": [
+                        {
+                            "name": "output013.mp3",
+                            "size": 1891269,
+                            "file": "/NarratoAI/resource/songs/output013.mp3",
+                        }
+                    ]
+                },
+            },
+        }
+
+
+class BgmUploadResponse(BaseResponse):  # only referenced by the commented-out POST /musics route
+    class Config:  # example payload attached to the generated JSON schema
+        json_schema_extra = {
+            "example": {
+                "status": 200,
+                "message": "success",
+                "data": {"file": "/NarratoAI/resource/songs/example.mp3"},
+            },
+        }
+
+
+class VideoClipParams(BaseModel):
+    """
+    NarratoAI data model
+    """
+    video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容")
+    video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径")
+    video_origin_path: Optional[str] = Field(default="", description="原视频路径")
+    video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例")
+    video_language: Optional[str] = Field(default="zh-CN", description="视频语言")
+
+    # video_clip_duration: Optional[int] = 5      # duration of each video clip
+    # video_count: Optional[int] = 1      # number of video clips
+    # video_source: Optional[str] = "local"
+    # video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
+
+    voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
+    voice_volume: Optional[float] = Field(default=1.0, description="解说语音音量")
+    voice_rate: Optional[float] = Field(default=1.0, description="语速")
+    voice_pitch: Optional[float] = Field(default=1.0, description="语调")
+
+    bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
+    bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
+    bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
+
+    subtitle_enabled: bool = True
+    font_name: str = "SimHei"  # default font: SimHei
+    font_size: int = 36
+    text_fore_color: str = "white"              # text foreground color
+    text_back_color: Optional[str] = None       # text background color
+    stroke_color: str = "black"                 # stroke color
+    stroke_width: float = 1.5                   # stroke width
+    subtitle_position: str = "bottom"   # top, bottom, center, custom
+    custom_position: float = 70.0       # custom position
+
+    n_threads: Optional[int] = Field(default=16, description="线程数")    # thread count; helps speed up video processing
+
+    tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量（后处理）")
+    original_volume: Optional[float] = Field(default=1.0, description="视频原声音量")
+    bgm_volume: Optional[float] = Field(default=0.3, description="背景音乐音量")
+
+
+class VideoTranscriptionRequest(BaseModel):  # imported by v1/llm.py but not used as a body there
+    video_name: str
+    language: str = "zh-CN"
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class VideoTranscriptionResponse(BaseModel):  # NOTE(review): handler returns utils.get_response(...); confirm shapes match
+    transcription: str
+
+
+class SubtitlePosition(str, Enum):  # string-valued enum of subtitle positions
+    TOP = "top"
+    CENTER = "center"
+    BOTTOM = "bottom"
+
diff --git a/app/models/schema_v2.py b/app/models/schema_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8584c7548664ff7de8447d29aaa5b8c911e71d15
--- /dev/null
+++ b/app/models/schema_v2.py
@@ -0,0 +1,63 @@
+from typing import Optional, List
+from pydantic import BaseModel
+
+
+class GenerateScriptRequest(BaseModel):
+    # Pydantic request model: `video_path` is the only required field, every
+    # other field is optional and falls back to the default shown below.
+    # NOTE(review): units/semantics of the numeric knobs (frame interval,
+    # skip seconds, threshold, batch size) are not visible in this file;
+    # confirm them against the v2 script controller.
+    video_path: str
+    video_theme: Optional[str] = ""
+    custom_prompt: Optional[str] = ""
+    frame_interval_input: Optional[int] = 5
+    skip_seconds: Optional[int] = 0
+    threshold: Optional[int] = 30
+    vision_batch_size: Optional[int] = 5
+    vision_llm_provider: Optional[str] = "gemini"
+
+
+class GenerateScriptResponse(BaseModel):
+    # Pydantic response model: a task identifier plus the script as a list
+    # of dicts (the dict schema is not constrained here).
+    task_id: str
+    script: List[dict]
+
+
+class CropVideoRequest(BaseModel):
+    # Pydantic request model: path of the original video plus the script
+    # items as a list of dicts (the dict schema is not constrained here).
+    video_origin_path: str
+    video_script: List[dict]
+
+
+class CropVideoResponse(BaseModel):
+    # Pydantic response model: a task identifier plus a dict of sub-clips.
+    # NOTE(review): key/value meaning of `subclip_videos` is not visible
+    # here; confirm against the code that fills it.
+    task_id: str
+    subclip_videos: dict
+
+
+class DownloadVideoRequest(BaseModel):
+    # Pydantic request model: `url` and `resolution` are required;
+    # `output_format` defaults to "mp4" and `rename` to None.
+    url: str
+    resolution: str
+    output_format: Optional[str] = "mp4"
+    rename: Optional[str] = None
+
+
+class DownloadVideoResponse(BaseModel):
+    # Pydantic response model; all five string fields are required.
+    task_id: str
+    output_path: str
+    resolution: str
+    format: str
+    filename: str
+
+
+class StartSubclipRequest(BaseModel):
+    # Pydantic request model: `task_id`, `video_origin_path`,
+    # `video_clip_json_path` and `subclip_videos` are required; the voice,
+    # subtitle, aspect and thread settings are optional with the defaults
+    # shown below.
+    task_id: str
+    video_origin_path: str
+    video_clip_json_path: str
+    voice_name: Optional[str] = None
+    voice_rate: Optional[int] = 0
+    voice_pitch: Optional[int] = 0
+    subtitle_enabled: Optional[bool] = True
+    video_aspect: Optional[str] = "16:9"
+    n_threads: Optional[int] = 4
+    # Video-clip dict obtained from the crop-video endpoint.
+    # NOTE(review): annotated as `list` although the comment above and
+    # CropVideoResponse.subclip_videos both describe a dict; confirm which
+    # type callers actually send.
+    subclip_videos: list
+
+
+class StartSubclipResponse(BaseModel):
+    # Pydantic response model: required `task_id` and `state`, plus two
+    # optional lists of strings that default to None.
+    task_id: str
+    state: str
+    videos: Optional[List[str]] = None
+    combined_videos: Optional[List[str]] = None
diff --git a/app/router.py b/app/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..df60500e8c1f9907254939b609a20434bfcb3307
--- /dev/null
+++ b/app/router.py
@@ -0,0 +1,21 @@
+"""Application configuration - root APIRouter.
+
+Defines all FastAPI application endpoints.
+
+Resources:
+    1. https://fastapi.tiangolo.com/tutorial/bigger-applications
+
+"""
+
+from fastapi import APIRouter
+
+from app.controllers.v1 import llm, video
+from app.controllers.v2 import script
+
+# Root router: every controller router below is mounted onto it.
+root_api_router = APIRouter()
+# v1: routers from the `video` and `llm` controller modules
+root_api_router.include_router(video.router)
+root_api_router.include_router(llm.router)
+
+# v2: router from the `script` controller module
+root_api_router.include_router(script.router)
diff --git a/app/services/SDE/prompt.py b/app/services/SDE/prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..78385ccc3a33e0944d2b210761c4f29dd40ec086
--- /dev/null
+++ b/app/services/SDE/prompt.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : prompt
+@Author : 小林同学
+@Date   : 2025/5/9 上午12:57 
+'''
+# Prompt for subtitle plot analysis. The subtitle text is appended after the
+# final "请处理以下字幕" heading by the caller.
+subtitle_plot_analysis_v1 = """
+# 角色
+你是一位专业的剧本分析师和剧情概括助手。
+
+# 任务
+我将为你提供一部短剧的完整字幕文本。请你基于这些字幕，完成以下任务：
+1.  **整体剧情分析**：简要概括整个短剧的核心剧情脉络、主要冲突和结局（如果有的话）。
+2.  **分段剧情解析与时间戳定位**：
+    *   将整个短剧划分为若干个关键的剧情段落（例如：开端、发展、转折、高潮、结局，或根据具体情节自然划分）。
+    *   段落数应该与字幕长度成正比。
+    *   对于每一个剧情段落：
+        *   **概括该段落的主要内容**：用简洁的语言描述这段剧情发生了什么。
+        *   **标注对应的时间戳范围**：明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳。请直接从字幕中提取时间信息。
+
+# 输入格式
+字幕内容通常包含时间戳和对话，例如：
+```
+00:00:05,000 --> 00:00:10,000
+[角色A]: 你好吗？
+00:00:10,500 --> 00:00:15,000
+[角色B]: 我很好，谢谢。发生了一些有趣的事情。
+... (更多字幕内容) ...
+```
+我将把实际字幕粘贴在下方。
+
+# 输出格式要求
+请按照以下格式清晰地呈现分析结果：
+
+**一、整体剧情概括：**
+[此处填写对整个短剧剧情的概括]
+
+**二、分段剧情解析：**
+
+**剧情段落 1：[段落主题/概括，例如：主角登场与背景介绍]**
+*   **时间戳：** [开始时间戳] --> [结束时间戳]
+*   **内容概要：** [对这段剧情的详细描述]
+
+**剧情段落 2：[段落主题/概括，例如：第一个冲突出现]**
+*   **时间戳：** [开始时间戳] --> [结束时间戳]
+*   **内容概要：** [对这段剧情的详细描述]
+
+... (根据实际剧情段落数量继续) ...
+
+**剧情段落 N：[段落主题/概括，例如：结局与反思]**
+*   **时间戳：** [开始时间戳] --> [结束时间戳]
+*   **内容概要：** [对这段剧情的详细描述]
+
+# 注意事项
+*   请确保时间戳的准确性，直接引用字幕中的时间。
+*   剧情段落的划分应合乎逻辑，能够反映剧情的起承转合。
+*   语言表达应简洁、准确、客观。
+
+# 限制
+1. 严禁输出与分析结果无关的内容
+
+# 请处理以下字幕：
+"""
+
+# Prompt for writing the narration script. It is %-formatted with two values,
+# in order: the drama title and the plot summary placed inside <plot>.
+plot_writing = """
+我是一个影视解说up主，需要为我的粉丝讲解短剧《%s》的剧情，目前正在解说剧情，希望能让粉丝通过我的解说了解剧情，并且产生 继续观看的兴趣，请生成一篇解说脚本，包含解说文案，以及穿插原声的片段，下面<plot>中的内容是短剧的剧情概述：
+
+<plot>
+%s
+</plot>
+
+请使用 json 格式进行输出；使用 <output> 中的输出格式：
+<output>
+{
+  "items": [
+    {
+        "_id": 1, # 唯一递增id
+        "timestamp": "00:00:05,390-00:00:10,430",
+        "picture": "剧情描述或者备注",
+        "narration": "解说文案，如果片段为穿插的原片片段，可以直接使用 ‘播放原片+_id‘ 进行占位",
+        "OST": "值为 0 表示当前片段为解说片段，值为 1 表示当前片段为穿插的原片"
+    }
+  ]
+}
+</output>
+
+<restriction>
+1. 只输出 json 内容，不要输出其他任何说明性的文字
+2. 解说文案的语言使用 简体中文
+3. 严禁虚构剧情，所有画面只能从 <plot> 中摘取
+4. 严禁虚构时间戳，所有时间戳范围只能从 <plot> 中摘取
+</restriction>
+"""
\ No newline at end of file
diff --git a/app/services/SDE/short_drama_explanation.py b/app/services/SDE/short_drama_explanation.py
new file mode 100644
index 0000000000000000000000000000000000000000..56a460dc23af685558220ab8f31be38f75781f2e
--- /dev/null
+++ b/app/services/SDE/short_drama_explanation.py
@@ -0,0 +1,456 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : 短剧解说
+@Author : 小林同学
+@Date   : 2025/5/9 上午12:36 
+'''
+
+import os
+import json
+import requests
+from typing import Dict, Any, Optional
+from loguru import logger
+from app.config import config
+from app.utils.utils import get_uuid, storage_dir
+from app.services.SDE.prompt import subtitle_plot_analysis_v1, plot_writing
+
+
+class SubtitleAnalyzer:
+    """Analyze subtitles and write narration scripts via a chat-completions API.
+
+    Requests are sent with ``requests.post`` to ``{base_url}/chat/completions``
+    using a bearer token. The request methods never raise: they return a dict
+    whose ``"status"`` is ``"success"`` or ``"error"``.
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+        base_url: Optional[str] = None,
+        custom_prompt: Optional[str] = None,
+        temperature: Optional[float] = 1.0,
+        timeout: Optional[float] = 600.0,
+    ):
+        """
+        Store the connection settings and build the request headers.
+
+        The values are used exactly as given; this class performs no
+        fallback to the application configuration.
+
+        Args:
+            api_key: API key sent as the bearer token
+            model: Model name placed in every request payload
+            base_url: API base URL; "/chat/completions" is appended to it
+            custom_prompt: Analysis prompt; the bundled
+                ``subtitle_plot_analysis_v1`` is used when empty or omitted
+            temperature: Sampling temperature used by ``analyze_subtitle``
+            timeout: Seconds to wait for the HTTP response; ``None`` waits
+                indefinitely
+        """
+        self.api_key = api_key
+        self.model = model
+        self.base_url = base_url
+        self.temperature = temperature
+        self.timeout = timeout
+
+        # Prompt template that precedes the subtitle text
+        self.prompt_template = custom_prompt or subtitle_plot_analysis_v1
+
+        self._init_headers()
+
+    def _init_headers(self):
+        """Build the HTTP headers (JSON content type plus bearer token)."""
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+    @staticmethod
+    def _error_result(message: str, temperature: Optional[float]) -> Dict[str, Any]:
+        """Return the error dict shape shared by every request method."""
+        return {
+            "status": "error",
+            "message": message,
+            "temperature": temperature
+        }
+
+    def _chat(
+        self,
+        messages: list,
+        temperature: Optional[float],
+        result_key: str,
+        label: str,
+        json_mode: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Send one chat-completions request and convert the response to a result dict.
+
+        Args:
+            messages: Chat messages placed in the payload
+            temperature: Sampling temperature sent and reported back
+            result_key: Key under which the generated text is returned
+            label: Task name used as the prefix of log messages
+            json_mode: Whether to request a ``json_object`` response format
+
+        Returns:
+            Dict[str, Any]: Success or error result dict
+
+        Raises:
+            Exception: Network and JSON-decoding errors propagate to the caller,
+                which converts them into an error result
+        """
+        payload = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": temperature
+        }
+        if json_mode:
+            payload["response_format"] = {"type": "json_object"}
+
+        # Tolerate a trailing slash in the configured base URL
+        url = f"{str(self.base_url).rstrip('/')}/chat/completions"
+
+        # A timeout keeps a stalled connection from blocking the caller forever
+        response = requests.post(url, headers=self.headers, json=payload, timeout=self.timeout)
+
+        if response.status_code != 200:
+            error_msg = f"请求失败，状态码: {response.status_code}, 响应: {response.text}"
+            logger.error(error_msg)
+            return self._error_result(error_msg, temperature)
+
+        response_data = response.json()
+        choices = response_data.get("choices")
+        if not choices:
+            logger.error(f"{label}失败: 未获取到有效响应")
+            return self._error_result("未获取到有效响应", temperature)
+
+        # "usage" may be missing or null depending on the provider
+        tokens_used = (response_data.get("usage") or {}).get("total_tokens", 0)
+        logger.debug(f"{label}完成，消耗的tokens: {tokens_used}")
+
+        return {
+            "status": "success",
+            result_key: choices[0]["message"]["content"],
+            "tokens_used": tokens_used,
+            "model": self.model,
+            "temperature": temperature
+        }
+
+    def _save_text(
+        self,
+        result: Dict[str, Any],
+        content_key: str,
+        failure_prefix: str,
+        subject: str,
+        output_path: Optional[str],
+        default_dir: str,
+        default_name: str,
+    ) -> str:
+        """
+        Write a result dict to a UTF-8 text file.
+
+        Args:
+            result: Result dict returned by a request method
+            content_key: Key of the text written when the result succeeded
+            failure_prefix: Prefix written before the error message otherwise
+            subject: Noun used in log messages
+            output_path: Target path; generated under ``default_dir`` when empty
+            default_dir: Storage sub-directory for generated paths
+            default_name: File-name template with one ``{}`` slot for a UUID
+
+        Returns:
+            str: Path of the written file, or "" when saving failed
+        """
+        try:
+            if not output_path:
+                output_dir = storage_dir(default_dir, create=True)
+                output_path = os.path.join(output_dir, default_name.format(get_uuid(True)))
+
+            # A bare file name has no directory part; makedirs("") would raise
+            parent_dir = os.path.dirname(output_path)
+            if parent_dir:
+                os.makedirs(parent_dir, exist_ok=True)
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                if result["status"] == "success":
+                    f.write(result[content_key])
+                else:
+                    f.write(f"{failure_prefix}: {result['message']}")
+
+            logger.info(f"{subject}已保存到: {output_path}")
+            return output_path
+
+        except Exception as e:
+            logger.error(f"保存{subject}时发生错误: {str(e)}")
+            return ""
+
+    def analyze_subtitle(self, subtitle_content: str) -> Dict[str, Any]:
+        """
+        Analyze subtitle text with the configured prompt template.
+
+        Args:
+            subtitle_content: Subtitle text appended after the prompt template
+
+        Returns:
+            Dict[str, Any]: On success ``status``, ``analysis``, ``tokens_used``,
+            ``model`` and ``temperature``; on failure ``status``, ``message``
+            and ``temperature``
+        """
+        try:
+            prompt = f"{self.prompt_template}\n\n{subtitle_content}"
+            messages = [
+                {"role": "system", "content": "你是一位专业的剧本分析师和剧情概括助手。"},
+                {"role": "user", "content": prompt}
+            ]
+            return self._chat(messages, self.temperature, "analysis", "字幕分析")
+
+        except Exception as e:
+            logger.error(f"字幕分析过程中发生错误: {str(e)}")
+            return self._error_result(str(e), self.temperature)
+
+    def analyze_subtitle_from_file(self, subtitle_file_path: str) -> Dict[str, Any]:
+        """
+        Read a UTF-8 subtitle file and analyze its content.
+
+        Args:
+            subtitle_file_path: Path of the subtitle file
+
+        Returns:
+            Dict[str, Any]: Same result dict as ``analyze_subtitle``; an error
+            result when the file does not exist or cannot be read
+        """
+        try:
+            if not os.path.exists(subtitle_file_path):
+                return self._error_result(f"字幕文件不存在: {subtitle_file_path}", self.temperature)
+
+            with open(subtitle_file_path, 'r', encoding='utf-8') as f:
+                subtitle_content = f.read()
+
+            return self.analyze_subtitle(subtitle_content)
+
+        except Exception as e:
+            logger.error(f"从文件读取字幕并分析过程中发生错误: {str(e)}")
+            return self._error_result(str(e), self.temperature)
+
+    def save_analysis_result(self, analysis_result: Dict[str, Any], output_path: Optional[str] = None) -> str:
+        """
+        Save an analysis result to a text file.
+
+        Args:
+            analysis_result: Result dict returned by ``analyze_subtitle``
+            output_path: Target path; generated in the "drama_analysis"
+                storage directory when omitted
+
+        Returns:
+            str: Path of the written file, or "" when saving failed
+        """
+        return self._save_text(
+            analysis_result, "analysis", "分析失败", "分析结果",
+            output_path, "drama_analysis", "analysis_{}.txt"
+        )
+
+    def generate_narration_script(self, short_name:str, plot_analysis: str, temperature: float = 0.7) -> Dict[str, Any]:
+        """
+        Generate a narration script from a plot analysis.
+
+        Args:
+            short_name: Title of the short drama
+            plot_analysis: Plot analysis text inserted into the prompt
+            temperature: Sampling temperature for this request, default 0.7
+
+        Returns:
+            Dict[str, Any]: On success ``status``, ``narration_script``,
+            ``tokens_used``, ``model`` and ``temperature``; on failure
+            ``status``, ``message`` and ``temperature``. ``temperature`` is
+            the value used for this request.
+        """
+        try:
+            prompt = plot_writing % (short_name, plot_analysis)
+            messages = [
+                {"role": "system", "content": "你是一位专业的短视频解说脚本撰写专家。"},
+                {"role": "user", "content": prompt}
+            ]
+            # JSON response format is requested for every model except
+            # "deepseek-reasoner" (presumably unsupported there; confirm)
+            json_mode = self.model not in ["deepseek-reasoner"]
+            return self._chat(messages, temperature, "narration_script", "解说文案生成", json_mode=json_mode)
+
+        except Exception as e:
+            logger.error(f"解说文案生成过程中发生错误: {str(e)}")
+            return self._error_result(str(e), temperature)
+
+    def save_narration_script(self, narration_result: Dict[str, Any], output_path: Optional[str] = None) -> str:
+        """
+        Save a narration script result to a file.
+
+        Args:
+            narration_result: Result dict returned by ``generate_narration_script``
+            output_path: Target path; generated in the "narration_scripts"
+                storage directory when omitted
+
+        Returns:
+            str: Path of the written file, or "" when saving failed
+        """
+        return self._save_text(
+            narration_result, "narration_script", "生成失败", "解说文案",
+            output_path, "narration_scripts", "narration_{}.json"
+        )
+
+
+def analyze_subtitle(
+        subtitle_content: str = None,
+        subtitle_file_path: str = None,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+        base_url: Optional[str] = None,
+        custom_prompt: Optional[str] = None,
+        temperature: float = 1.0,
+        save_result: bool = False,
+        output_path: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Convenience wrapper: build a SubtitleAnalyzer and analyze subtitles once.
+
+    Args:
+        subtitle_content: Subtitle text; takes precedence over the file path
+        subtitle_file_path: Subtitle file, used when no text is given
+        api_key: API key
+        model: Model name
+        base_url: API base URL
+        custom_prompt: Custom analysis prompt
+        temperature: Model temperature
+        save_result: Whether a successful result is written to a file
+        output_path: Output file path used when saving
+
+    Returns:
+        Dict[str, Any]: Analysis result dict; gains an ``output_path`` entry
+        when a successful result was saved
+    """
+    worker = SubtitleAnalyzer(
+        api_key=api_key,
+        model=model,
+        base_url=base_url,
+        custom_prompt=custom_prompt,
+        temperature=temperature
+    )
+    logger.debug(f"使用模型: {worker.model} 开始分析, 温度: {worker.temperature}")
+
+    # Neither source given: nothing to analyze
+    if not subtitle_content and not subtitle_file_path:
+        return {
+            "status": "error",
+            "message": "必须提供字幕内容或字幕文件路径",
+            "temperature": temperature
+        }
+
+    if subtitle_content:
+        outcome = worker.analyze_subtitle(subtitle_content)
+    else:
+        outcome = worker.analyze_subtitle_from_file(subtitle_file_path)
+
+    # Only successful analyses are persisted
+    if save_result:
+        if outcome["status"] == "success":
+            outcome["output_path"] = worker.save_analysis_result(outcome, output_path)
+
+    return outcome
+
+
+def generate_narration_script(
+    short_name: str = None,
+    plot_analysis: str = None,
+    api_key: Optional[str] = None,
+    model: Optional[str] = None,
+    base_url: Optional[str] = None,
+    temperature: float = 1.0,
+    save_result: bool = False,
+    output_path: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Convenience wrapper: build a SubtitleAnalyzer and generate one narration script.
+
+    Args:
+        short_name: Title of the short drama
+        plot_analysis: Plot analysis text, passed in directly
+        api_key: API key
+        model: Model name
+        base_url: API base URL
+        temperature: Generation temperature
+        save_result: Whether a successful result is written to a file
+        output_path: Output file path used when saving
+
+    Returns:
+        Dict[str, Any]: Generation result dict; gains an ``output_path`` entry
+        when a successful result was saved
+    """
+    writer = SubtitleAnalyzer(
+        api_key=api_key,
+        model=model,
+        base_url=base_url,
+        temperature=temperature
+    )
+
+    outcome = writer.generate_narration_script(short_name, plot_analysis, temperature)
+
+    # Only successful generations are persisted
+    if save_result:
+        if outcome["status"] == "success":
+            outcome["output_path"] = writer.save_narration_script(outcome, output_path)
+
+    return outcome
+
+
+if __name__ == '__main__':
+    # Manual demo: analyze a subtitle file, then generate a narration script.
+    text_api_key = "skxxxx"
+    text_model = "gemini-2.0-flash"
+    # Base URL only, without a trailing slash: SubtitleAnalyzer appends
+    # "/chat/completions" itself.
+    text_base_url = "https://api.narratoai.cn/v1"
+    subtitle_path = "/Users/apple/Desktop/home/NarratoAI/resource/srt/家里家外1-5.srt"
+
+    # Example usage
+    if subtitle_path:
+        # The narration prompt needs a drama title; use the subtitle file
+        # name without its extension.
+        short_name = os.path.splitext(os.path.basename(subtitle_path))[0]
+
+        # Analyze the subtitles and summarize the plot
+        analysis_result = analyze_subtitle(
+            subtitle_file_path=subtitle_path,
+            api_key=text_api_key,
+            model=text_model,
+            base_url=text_base_url,
+            save_result=True
+        )
+
+        if analysis_result["status"] == "success":
+            print("字幕分析成功！")
+            print("分析结果：")
+            print(analysis_result["analysis"])
+
+            # Generate the narration script from the plot analysis
+            narration_result = generate_narration_script(
+                short_name=short_name,
+                plot_analysis=analysis_result["analysis"],
+                api_key=text_api_key,
+                model=text_model,
+                base_url=text_base_url,
+                save_result=True
+            )
+
+            if narration_result["status"] == "success":
+                print("\n解说文案生成成功！")
+                print("解说文案：")
+                print(narration_result["narration_script"])
+            else:
+                print(f"\n解说文案生成失败: {narration_result['message']}")
+        else:
+            print(f"分析失败: {analysis_result['message']}")
diff --git a/app/services/SDP/generate_script_short.py b/app/services/SDP/generate_script_short.py
new file mode 100644
index 0000000000000000000000000000000000000000..caaad93cfe36b2ea7c71c4af1f3a5a16d8071793
--- /dev/null
+++ b/app/services/SDP/generate_script_short.py
@@ -0,0 +1,37 @@
+"""
+视频脚本生成pipeline，串联各个处理步骤
+"""
+import os
+from .utils.step1_subtitle_analyzer_openai import analyze_subtitle
+from .utils.step5_merge_script import merge_script
+
+
+def generate_script(srt_path: str, api_key: str, model_name: str, output_path: str, base_url: str = None, custom_clips: int = 5):
+    """Generate a video remix script from a subtitle file.
+
+    Args:
+        srt_path: Path of the SRT subtitle file
+        api_key: API key forwarded to the subtitle analysis step
+        model_name: Model name forwarded to the subtitle analysis step
+        output_path: File the merged script is written to
+        base_url: API base URL forwarded to the subtitle analysis step
+        custom_clips: Number of key clips requested from the analysis step
+
+    Returns:
+        The value returned by ``merge_script`` for the analyzed plot points
+
+    Raises:
+        FileNotFoundError: If ``srt_path`` does not exist
+    """
+    # Fail early on a missing subtitle file
+    subtitle_exists = os.path.exists(srt_path)
+    if not subtitle_exists:
+        raise FileNotFoundError(f"字幕文件不存在: {srt_path}")
+
+    print("开始分析...")
+    analysis = analyze_subtitle(
+        srt_path=srt_path,
+        api_key=api_key,
+        model_name=model_name,
+        base_url=base_url,
+        custom_clips=custom_clips
+    )
+
+    # Merge the located plot points into the final script file
+    return merge_script(analysis['plot_points'], output_path)
diff --git a/app/services/SDP/utils/short_schema.py b/app/services/SDP/utils/short_schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..f316a89aa1dabc315924cb6182543092d58e3dd2
--- /dev/null
+++ b/app/services/SDP/utils/short_schema.py
@@ -0,0 +1,60 @@
+"""
+定义项目中使用的数据类型
+"""
+from typing import List, Dict, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class PlotPoint:
+    # Plain record with three required string fields.
+    # NOTE(review): field names mirror the "plot_points" items requested from
+    # the model in step1_subtitle_analyzer_openai; confirm this class is
+    # actually used for them.
+    timestamp: str
+    title: str
+    picture: str
+
+
+@dataclass
+class Commentary:
+    # Plain record with three required string fields: timestamp, title and
+    # copywriter (presumably the commentary text; confirm with its producer).
+    timestamp: str
+    title: str
+    copywriter: str
+
+
+@dataclass
+class SubtitleSegment:
+    # Plain record: start and end time as floats plus the segment text.
+    # NOTE(review): time unit is not stated here (presumably seconds; confirm).
+    start_time: float
+    end_time: float
+    text: str
+
+
+@dataclass
+class ScriptItem:
+    # Plain record with four required string fields; it carries the fields of
+    # PlotPoint plus the `copywriter` field of Commentary.
+    timestamp: str
+    title: str
+    picture: str
+    copywriter: str
+
+
+@dataclass
+class PipelineResult:
+    # Aggregate record: an output video path, lists of the record types
+    # defined in this module, and an optional error string that defaults
+    # to None.
+    output_video_path: str
+    plot_points: List[PlotPoint]
+    subtitle_segments: List[SubtitleSegment]
+    commentaries: List[Commentary]
+    final_script: List[ScriptItem]
+    error: Optional[str] = None
+
+
+class VideoProcessingError(Exception):
+    # Exception subclass with no added behavior; by its name, intended for
+    # video-processing failures.
+    pass
+
+
+class SubtitleProcessingError(Exception):
+    # Exception subclass with no added behavior; by its name, intended for
+    # subtitle-processing failures.
+    pass
+
+
+class PlotAnalysisError(Exception):
+    # Exception subclass with no added behavior; by its name, intended for
+    # plot-analysis failures.
+    pass
+
+
+class CopywritingError(Exception):
+    # Exception subclass with no added behavior; by its name, intended for
+    # copywriting failures.
+    pass
diff --git a/app/services/SDP/utils/step1_subtitle_analyzer_openai.py b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ea3b0faa6a57e900b7841d19bd1a817c453d81
--- /dev/null
+++ b/app/services/SDP/utils/step1_subtitle_analyzer_openai.py
@@ -0,0 +1,157 @@
+"""
+使用OpenAI API，分析字幕文件，返回剧情梗概和爆点
+"""
+import traceback
+from openai import OpenAI, BadRequestError
+import os
+import json
+
+from .utils import load_srt
+
+
+def _request_json(client, model_name: str, messages: list, error_prefix: str) -> dict:
+    """Request a JSON object from the chat model and return it parsed.
+
+    The request is first sent with ``response_format=json_object``. When the
+    API rejects that with ``BadRequestError``, it is retried without the
+    option and Markdown code fences are stripped before parsing.
+
+    Args:
+        client: OpenAI client used for the request
+        model_name: Model name
+        messages: Chat messages
+        error_prefix: Prefix of the message raised for unexpected failures
+
+    Returns:
+        dict: Parsed JSON object
+
+    Raises:
+        Exception: If the first request or its parsing fails for a reason
+            other than ``BadRequestError``; errors of the fallback request
+            propagate unchanged
+    """
+    try:
+        completion = client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            response_format={"type": "json_object"}
+        )
+        return json.loads(completion.choices[0].message.content)
+    except BadRequestError:
+        # The original note names DeepSeek R1 and V3 as models that do not
+        # support response_format=json_object
+        completion = client.chat.completions.create(
+            model=model_name,
+            messages=messages
+        )
+        # Strip the leading ```json and the trailing ``` fences
+        raw_text = completion.choices[0].message.content.replace("```json", "").replace("```", "")
+        return json.loads(raw_text)
+    except Exception as e:
+        raise Exception(f"{error_prefix}：{str(e)}\n{traceback.format_exc()}") from e
+
+
+def analyze_subtitle(
+    srt_path: str,
+    model_name: str,
+    api_key: str = None,
+    base_url: str = None,
+    custom_clips: int = 5
+) -> dict:
+    """Analyze a subtitle file and return the plot summary and located plot points.
+
+    Two model requests are made: the first asks for a summary and
+    ``custom_clips`` plot titles, the second asks for the time range and
+    details of each title.
+
+    Args:
+        srt_path (str): Path of the SRT subtitle file
+        model_name (str): Model name. Names containing "deepseek" use the
+            ``DeepSeek_API_KEY`` environment variable as the fallback key and
+            the SiliconFlow endpoint as the fallback base URL.
+        api_key (str, optional): API key. Defaults to None, in which case an
+            environment variable is read.
+        base_url (str, optional): API base URL. Defaults to None.
+        custom_clips (int, optional): Number of key clips to request.
+            Defaults to 5.
+
+    Returns:
+        dict: ``plot_summary`` (the parsed first response) and ``plot_points``
+        (the "plot_points" list of the second response)
+
+    Raises:
+        Exception: Any failure, wrapped with a message and traceback text
+    """
+    try:
+        # Load the subtitle file
+        subtitles = load_srt(srt_path)
+        subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])
+
+        # Build a client local to this call (no module-level state)
+        if "deepseek" in model_name.lower():
+            client = OpenAI(
+                api_key=api_key or os.getenv('DeepSeek_API_KEY'),
+                # An explicit base_url wins; otherwise use the third-party
+                # SiliconFlow API
+                base_url=base_url or "https://api.siliconflow.cn/v1"
+            )
+        else:
+            client = OpenAI(
+                api_key=api_key or os.getenv('OPENAI_API_KEY'),
+                base_url=base_url
+            )
+
+        messages = [
+            {
+                "role": "system",
+                "content": """你是一名经验丰富的短剧编剧，擅长根据字幕内容按照先后顺序分析关键剧情,并找出 %s 个关键片段。
+                请返回一个JSON对象，包含以下字段：
+                {
+                    "summary": "整体剧情梗概",
+                    "plot_titles": [
+                        "关键剧情1",
+                        "关键剧情2",
+                        "关键剧情3",
+                        "关键剧情4",
+                        "关键剧情5",
+                        "..."
+                    ]
+                }
+                请确保返回的是合法的JSON格式, 请确保返回的是 %s 个片段。
+                """ % (custom_clips, custom_clips)
+            },
+            {
+                "role": "user",
+                "content": f"srt字幕如下：{subtitle_content}"
+            }
+        ]
+        summary_data = _request_json(client, model_name, messages, "大模型解析发生错误")
+
+        print(json.dumps(summary_data, indent=4, ensure_ascii=False))
+
+        # Second request: locate the time range of every plot title
+        prompt = f"""剧情梗概：
+            {summary_data['summary']}
+
+            需要定位的爆点内容：
+            """
+        print(f"找到 {len(summary_data['plot_titles'])} 个片段")
+        for i, point in enumerate(summary_data['plot_titles'], 1):
+            prompt += f"{i}. {point}\n"
+
+        messages = [
+            {
+                "role": "system",
+                "content": """你是一名短剧编剧，非常擅长根据字幕中分析视频中关键剧情出现的具体时间段。
+                请仔细阅读剧情梗概和爆点内容，然后在字幕中找出每个爆点发生的具体时间段和爆点前后的详细剧情。
+                
+                请返回一个JSON对象，包含一个名为"plot_points"的数组，数组中包含多个对象，每个对象都要包含以下字段：
+                {
+                    "plot_points": [
+                        {
+                            "timestamp": "时间段，格式为xx:xx:xx,xxx-xx:xx:xx,xxx",
+                            "title": "关键剧情的主题",
+                            "picture": "关键剧情前后的详细剧情描述"
+                        }
+                    ]
+                }
+                请确保返回的是合法的JSON格式。"""
+            },
+            {
+                "role": "user",
+                "content": f"""字幕内容：
+{subtitle_content}
+
+{prompt}"""
+            }
+        ]
+        plot_points_data = _request_json(client, model_name, messages, "大模型解析错误")
+
+        print(json.dumps(plot_points_data, indent=4, ensure_ascii=False))
+
+        # Combine both responses
+        return {
+            "plot_summary": summary_data,
+            "plot_points": plot_points_data["plot_points"]
+        }
+
+    except Exception as e:
+        raise Exception(f"分析字幕时发生错误：{str(e)}\n{traceback.format_exc()}") from e
diff --git a/app/services/SDP/utils/step5_merge_script.py b/app/services/SDP/utils/step5_merge_script.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e57209f5163671b9f6b601a53b9fead6b0914f
--- /dev/null
+++ b/app/services/SDP/utils/step5_merge_script.py
@@ -0,0 +1,69 @@
+"""
+合并生成最终脚本
+"""
+import os
+import json
+from typing import List, Dict, Tuple
+
+
+def merge_script(
+        plot_points: List[Dict],
+        output_path: str
+):
+    """Build the final script from plot points and write it to a JSON file.
+
+    Args:
+        plot_points: Plot points; each needs a "timestamp" of the form
+            "HH:MM:SS,mmm-HH:MM:SS,mmm" (milliseconds optional) and a "picture"
+        output_path: File the script is written to as JSON
+
+    Returns:
+        list: Script items, one per plot point, numbered from 1
+
+    Raises:
+        ValueError: If a timestamp cannot be parsed
+    """
+    def _clock_to_seconds(clock: str) -> float:
+        """Convert "HH:MM:SS" or "HH:MM:SS,mmm" to seconds."""
+        clock = clock.strip()
+        millis = 0
+        if ',' in clock:
+            clock, millis_text = clock.split(',')
+            millis = float(millis_text) / 1000
+        hh, mm, ss = map(int, clock.split(':'))
+        return hh * 3600 + mm * 60 + ss + millis
+
+    def _parse_range(ts: str) -> Tuple[float, float]:
+        """Parse "start-end" into (start, end) seconds."""
+        begin, finish = ts.split('-')
+        return _clock_to_seconds(begin), _clock_to_seconds(finish)
+
+    final_script = []
+
+    # Every plot point becomes an original-footage item
+    for index, point in enumerate(plot_points, start=1):
+        # Parsed only to reject malformed timestamps; the values are unused
+        _parse_range(point["timestamp"])
+        final_script.append({
+            "_id": index,
+            "timestamp": point["timestamp"],
+            "picture": point["picture"],
+            "narration": f"播放原生_{os.urandom(4).hex()}",
+            "OST": 1,  # OST=0 keeps narration only, OST=2 keeps narration and original audio
+        })
+
+    # Persist the script
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(final_script, f, ensure_ascii=False, indent=4)
+
+    print(f"脚本生成完成：{output_path}")
+    return final_script
diff --git a/app/services/SDP/utils/utils.py b/app/services/SDP/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..292d5e0aeb83f8e8ce1c74248bfe81f27680070f
--- /dev/null
+++ b/app/services/SDP/utils/utils.py
@@ -0,0 +1,45 @@
+# 公共方法
+import json
+import requests  # 新增
+from typing import List, Dict
+
+
+def load_srt(file_path: str) -> List[Dict]:
+    """Load and parse an SRT file.
+
+    Blocks are separated by one or more blank (or whitespace-only) lines.
+    A block needs at least three lines: index, timestamp and text. Blocks
+    with a non-numeric index or a malformed timestamp are skipped with a
+    printed warning.
+
+    Args:
+        file_path: Path of the SRT file (UTF-8, optional BOM)
+
+    Returns:
+        List of dicts with the keys 'number', 'timestamp', 'text',
+        'start_time' and 'end_time'; text lines of a block are joined with
+        single spaces
+    """
+    import re
+
+    with open(file_path, 'r', encoding='utf-8-sig') as f:
+        content = f.read().strip()
+
+    # Split on runs of blank lines so extra or whitespace-only separator
+    # lines do not leave an empty first line in the following block
+    subtitle_blocks = re.split(r'\n\s*\n', content)
+    subtitles = []
+
+    for block in subtitle_blocks:
+        lines = block.split('\n')
+        if len(lines) < 3:  # not enough lines for index, timestamp and text
+            continue
+        try:
+            number = int(lines[0].strip())
+            timestamp = lines[1].strip()
+            text = ' '.join(lines[2:])
+
+            # Split the timestamp line into start and end
+            start_time, end_time = timestamp.split(' --> ')
+        except ValueError as e:
+            print(f"Warning: 跳过无效的字幕块: {e}")
+            continue
+
+        subtitles.append({
+            'number': number,
+            'timestamp': timestamp,
+            'text': text,
+            'start_time': start_time,
+            'end_time': end_time
+        })
+
+    return subtitles
diff --git a/app/services/__init__.py b/app/services/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py
new file mode 100644
index 0000000000000000000000000000000000000000..bedb585aa0e5f24f0f5f07876fe3bcd83132958b
--- /dev/null
+++ b/app/services/audio_merger.py
@@ -0,0 +1,171 @@
+import os
+import json
+import subprocess
+import edge_tts
+from edge_tts import submaker
+from pydub import AudioSegment
+from typing import List, Dict
+from loguru import logger
+from app.utils import utils
+
+
+def check_ffmpeg():
+    """Return True when the ``ffmpeg`` executable can be launched and exits with status 0."""
+    try:
+        probe = subprocess.run(['ffmpeg', '-version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return probe.returncode == 0
+    except OSError:  # FileNotFoundError, PermissionError, ...: ffmpeg cannot be started
+        return False
+
+
+def merge_audio_files(task_id: str, total_duration: float, list_script: list):
+    """
+    Mix the per-segment audio clips onto one silent track and export it as MP3.
+
+    Args:
+        task_id: Task ID; the result is written into that task's directory.
+        total_duration: Length of the final track, in seconds.
+        list_script: Full script; every item carries ``duration`` (seconds) and
+            ``audio`` (path of its clip, empty when the item has no clip). Items
+            are placed back to back in list order.
+
+    Returns:
+        str: Path of the merged audio file, or None when FFmpeg is not available.
+    """
+    # Merging is refused when FFmpeg cannot be run
+    if not check_ffmpeg():
+        logger.error("FFmpeg未安装，无法合并音频文件")
+        return None
+
+    # Silent canvas spanning the whole timeline (the duration is given in milliseconds)
+    mixed = AudioSegment.silent(duration=total_duration * 1000)
+
+    # Start offset of the current item in seconds; items are laid out back to back
+    # according to their ``duration`` values
+    cursor = 0
+
+    for segment in list_script:
+        try:
+            duration = segment['duration']
+            clip_available = segment['audio'] and os.path.exists(segment['audio'])
+
+            if not clip_available:
+                # Nothing to overlay: this item's slot simply stays silent
+                logger.info(f"片段 {segment.get('timestamp', '')} 没有音频文件，保留 {duration} 秒的间隔")
+            else:
+                # Load the clip and lay it over the canvas at the item's offset
+                clip = AudioSegment.from_file(segment['audio'])
+                mixed = mixed.overlay(clip, position=cursor * 1000)
+
+            # Move on to where the next item starts
+            cursor += duration
+
+        except Exception as e:
+            logger.error(f"处理音频片段时出错: {str(e)}")
+            # Even after a failure the offset must advance, otherwise every
+            # later clip would be placed too early
+            if 'duration' in segment:
+                cursor += segment['duration']
+            continue
+
+    # Write the mix into the task directory
+    output_audio_path = os.path.join(utils.task_dir(task_id), "merger_audio.mp3")
+    mixed.export(output_audio_path, format="mp3")
+    logger.info(f"合并后的音频文件已保存: {output_audio_path}")
+
+    return output_audio_path
+
+
+def time_to_seconds(time_str):
+    """
+    Convert a time string to seconds. Supported layouts:
+    1. 'HH:MM:SS,mmm' (hours:minutes:seconds,milliseconds)
+    2. 'MM:SS,mmm' (minutes:seconds,milliseconds)
+    3. 'SS,mmm' (seconds,milliseconds)
+
+    The ',mmm' part is optional. A string that cannot be parsed is logged and
+    yields 0.0.
+    """
+    try:
+        # Optional millisecond suffix after the comma
+        ms = 0
+        time_part = time_str
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+
+        fields = time_part.split(':')
+        if len(fields) in (2, 3):
+            # Fold left to right in base 60: ((HH * 60) + MM) * 60 + SS
+            seconds = 0
+            for field in fields:
+                seconds = seconds * 60 + int(field)
+        else:
+            # Any other shape: only the first field is read, as plain seconds
+            seconds = int(fields[0])
+
+        return seconds + ms
+    except (ValueError, IndexError) as e:
+        logger.error(f"Error parsing time {time_str}: {str(e)}")
+        return 0.0
+
+
+def extract_timestamp(filename):
+    """
+    Extract the start and end time encoded in an audio file name.
+    Example: "audio_00_06,500-00_24,800.mp3" -> (6.5, 24.8)
+
+    A directory prefix is ignored, so a full path may be passed as well.
+    Returns (0.0, 0.0) and logs an error when the name does not match.
+    """
+    try:
+        # Work on the bare file name: '_' or '-' in a directory must not be parsed
+        name = os.path.basename(filename)
+        # "audio_00_06,500-00_24,800.mp3" -> "00_06,500-00_24,800"
+        time_part = name.split('_', 1)[1].split('.')[0]
+        start_time, end_time = time_part.split('-')
+
+        # File names use '_' where the timestamp had ':'; undo that, then convert
+        start_seconds = time_to_seconds(start_time.replace('_', ':'))
+        end_seconds = time_to_seconds(end_time.replace('_', ':'))
+        return start_seconds, end_seconds
+    except Exception as e:
+        logger.error(f"Error extracting timestamp from {filename}: {str(e)}")
+        return 0.0, 0.0
+
+
+if __name__ == "__main__":
+    # Example usage: merge the narration clips of a demo script into one 90-second track
+    total_duration = 90
+
+    video_script = [
+        {'picture': '【解说】好的，各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！',
+         'timestamp': '00:00:00-00:00:26',
+         'narration': '好的各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！上集片尾那个巨大的悬念，这一集就立刻揭晓了！范闲假死归来，他面临的第一个，也是最大的难关，就是如何面对他最敬爱的，同时也是最可怕的那个人——庆帝！',
+         'OST': 0, 'duration': 26, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3'},
+        {'picture': '【解说】上一集我们看到，范闲在北齐遭遇了惊天变故，生死不明！', 'timestamp': '00:01:15-00:01:29',
+         'narration': '但我们都知道，他绝不可能就这么轻易退场！第二集一开场，范闲就已经秘密回到了京都。他的生死传闻，可不像我们想象中那样只是小范围流传，而是…',
+         'OST': 0, 'duration': 14, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3'},
+        {'picture': '画面切到王启年小心翼翼地向范闲汇报。', 'timestamp': '00:04:41-00:04:58',
+         'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
+         'OST': 1, 'duration': 17, 
+         'audio': ''},
+        {'picture': '【解说】"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。',
+         'timestamp': '00:04:58-00:05:20',
+         'narration': '"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。但范闲是谁啊？他偏要反其道而行之！他竟然决定，直接去见庆帝！冒着天大的风险，用"假死"这个事实去赌庆帝的态度！',
+         'OST': 0, 'duration': 22, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3'},
+        {'picture': '【解说】但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！',
+         'timestamp': '00:05:45-00:05:53',
+         'narration': '但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！',
+         'OST': 0, 'duration': 8, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'},
+        {'picture': '画面切换到范闲蒙面闯入皇宫，被侍卫包围的场景。', 'timestamp': '00:06:00-00:06:03',
+         'narration': '抓刺客',
+         'OST': 1, 'duration': 3, 
+         'audio': ''}]
+
+    output_file = merge_audio_files("test456", total_duration, video_script)
+    print(output_file)
diff --git a/app/services/clip_video.py b/app/services/clip_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b126bff4d4659a05b94480dca7f989e1ab27fe
--- /dev/null
+++ b/app/services/clip_video.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : clip_video
+@Author : 小林同学
+@Date   : 2025/5/6 下午6:14
+'''
+
+import os
+import subprocess
+import json
+import hashlib
+from loguru import logger
+from typing import Dict, List, Optional
+from pathlib import Path
+
+from app.utils import ffmpeg_utils
+
+
+def parse_timestamp(timestamp: str) -> tuple:
+    """
+    Split a timestamp range into its start and end parts.
+
+    Args:
+        timestamp: Range formatted as 'HH:MM:SS-HH:MM:SS' or 'HH:MM:SS,sss-HH:MM:SS,sss'
+
+    Returns:
+        tuple: (start, end), each as 'HH:MM:SS' or 'HH:MM:SS,sss'
+    """
+    begin, finish = timestamp.split('-')  # anything but exactly two parts raises ValueError
+    return begin, finish
+
+
+def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
+    """
+    Compute the end timestamp reached ``duration`` (+ padding) seconds after the start.
+
+    Args:
+        start_time: Start time as 'HH:MM:SS' or 'HH:MM:SS,sss' (with milliseconds)
+        duration: Length of the segment, in seconds
+        extra_seconds: Padding appended after the segment, in seconds (default 1)
+
+    Returns:
+        str: End time in the same format as ``start_time``. When the input has no
+            millisecond part, the fractional second of the result is dropped.
+
+    Raises:
+        ValueError: If ``start_time`` matches neither of the two formats
+    """
+    # Split off the optional ',sss' millisecond suffix
+    has_milliseconds = ',' in start_time
+    if has_milliseconds:
+        clock, ms_part = start_time.split(',')
+        start_ms = int(ms_part)
+    else:
+        clock, start_ms = start_time, 0
+    h, m, s = map(int, clock.split(':'))
+
+    # round() rather than int(): a product such as 1.001 * 1000 evaluates to
+    # 1000.9999999999999 in binary floating point, and truncating it loses 1 ms
+    offset_ms = round((duration + extra_seconds) * 1000)
+    total_ms = (h * 3600 + m * 60 + s) * 1000 + start_ms + offset_ms
+
+    # Break the total back down into hours / minutes / seconds / milliseconds
+    total_seconds, ms_new = divmod(total_ms, 1000)
+    minutes, s_new = divmod(total_seconds, 60)
+    h_new, m_new = divmod(minutes, 60)
+
+    # Mirror the input format
+    if has_milliseconds:
+        return f"{h_new:02d}:{m_new:02d}:{s_new:02d},{ms_new:03d}"
+    return f"{h_new:02d}:{m_new:02d}:{s_new:02d}"
+
+
+def check_hardware_acceleration() -> Optional[str]:
+    """
+    Report which FFmpeg hardware-acceleration type this system supports.
+
+    Returns:
+        Optional[str]: The value of ``ffmpeg_utils.get_ffmpeg_hwaccel_type()``; presumably None when unsupported
+    """
+    hwaccel_type = ffmpeg_utils.get_ffmpeg_hwaccel_type()  # detection is centralised in ffmpeg_utils
+    return hwaccel_type
+
+
+def clip_video(
+        video_origin_path: str,
+        tts_result: List[Dict],
+        output_dir: Optional[str] = None,
+        task_id: Optional[str] = None
+) -> Dict[str, str]:
+    """
+    Cut one clip out of the source video for every entry of ``tts_result``.
+
+    Each clip starts at the start time of the entry's ``timestamp`` and lasts the
+    entry's ``duration`` plus the default one-second margin of
+    ``calculate_end_time``; the end time inside ``timestamp`` is not used.
+
+    Args:
+        video_origin_path: Path of the source video
+        tts_result: Entries providing ``timestamp`` ('start-end') and ``duration``
+            (seconds); an optional ``_id`` is used as the key of the result
+        output_dir: Output directory; when None it is derived as
+            <three levels above this file>/storage/temp/clip_video/<task_id>
+        task_id: Task ID used for the derived output directory; when None an MD5
+            of the inputs is used
+
+    Returns:
+        Dict[str, str]: Maps each entry's ``_id`` (its ``timestamp`` when it has no
+            ``_id``) to the path of the clip that was written
+
+    Raises:
+        FileNotFoundError: If ``video_origin_path`` does not exist
+        RuntimeError: If FFmpeg exits with an error for any clip
+    """
+    # Fail fast when the source video is missing
+    if not os.path.exists(video_origin_path):
+        raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")
+
+    # Without a task_id, derive a stable one from the inputs
+    if task_id is None:
+        content_for_hash = f"{video_origin_path}_{json.dumps(tts_result)}"
+        task_id = hashlib.md5(content_for_hash.encode()).hexdigest()
+
+    # Default output directory (three levels above this file; presumably the project root)
+    if output_dir is None:
+        output_dir = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+            "storage", "temp", "clip_video", task_id
+        )
+
+    # Make sure the output directory exists
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Hardware-acceleration flags, if any acceleration is available
+    hwaccel = check_hardware_acceleration()
+    hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args() if hwaccel else []
+    video_codec = "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264"
+
+    # On Windows decode FFmpeg's output as UTF-8 (avoids GBK decoding errors);
+    # elsewhere keep the locale default. Undecodable bytes are replaced so that
+    # odd bytes in FFmpeg's log cannot abort a clip that was cut successfully.
+    output_encoding = 'utf-8' if os.name == 'nt' else None
+
+    # Clip paths keyed by entry id
+    result = {}
+
+    for item in tts_result:
+        _id = item.get("_id", item.get("timestamp", "unknown"))
+        timestamp = item["timestamp"]
+        start_time, _ = parse_timestamp(timestamp)
+
+        # The real end time comes from the duration (plus the 1 s margin)
+        calculated_end_time = calculate_end_time(start_time, item["duration"])
+
+        # FFmpeg-compatible time format: '.' instead of ',' before the milliseconds
+        ffmpeg_start_time = start_time.replace(',', '.')
+        ffmpeg_end_time = calculated_end_time.replace(',', '.')
+
+        # File-system friendly name: ':' and ',' become '-'
+        safe_start_time = start_time.replace(':', '-').replace(',', '-')
+        safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
+        output_path = os.path.join(output_dir, f"vid_{safe_start_time}@{safe_end_time}.mp4")
+
+        ffmpeg_cmd = [
+            "ffmpeg", "-y", *hwaccel_args,
+            "-i", video_origin_path,
+            "-ss", ffmpeg_start_time,
+            "-to", ffmpeg_end_time,
+            "-c:v", video_codec,
+            "-c:a", "aac",
+            "-strict", "experimental",
+            output_path
+        ]
+
+        try:
+            logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}")
+            subprocess.run(
+                ffmpeg_cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                encoding=output_encoding,
+                errors='replace',
+                check=True
+            )
+        except subprocess.CalledProcessError as e:
+            logger.error(f"裁剪视频片段失败: {timestamp}")
+            logger.error(f"错误信息: {e.stderr}")
+            # Chain the cause so the original failure stays in the traceback
+            raise RuntimeError(f"视频裁剪失败: {e.stderr}") from e
+
+        result[_id] = output_path
+
+    return result
+
+
+if __name__ == "__main__":
+    video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4"
+
+    tts_result = [{'timestamp': '00:00:00-00:01:15',
+                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
+                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
+                   'duration': 25.55,
+                   'text': '好的各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！上集片尾那个巨大的悬念，这一集就立刻揭晓了！范闲假死归来，他面临的第一个，也是最大的难关，就是如何面对他最敬爱的，同时也是最可怕的那个人——庆帝！'},
+                  {'timestamp': '00:01:15-00:04:40',
+                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
+                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
+                   'duration': 13.488,
+                   'text': '但我们都知道，他绝不可能就这么轻易退场！第二集一开场，范闲就已经秘密回到了京都。他的生死传闻，可不像我们想象中那样只是小范围流传，而是…'},
+                  {'timestamp': '00:04:58-00:05:45',
+                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
+                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
+                   'duration': 21.363,
+                   'text': '"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。但范闲是谁啊？他偏要反其道而行之！他竟然决定，直接去见庆帝！冒着天大的风险，用"假死"这个事实去赌庆帝的态度！'},
+                  {'timestamp': '00:05:45-00:06:00',
+                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
+                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
+                   'duration': 7.675, 'text': '但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！'}]
+    # NOTE(review): looks like a sample of the mapping clip_video returns; kept for reference, it is not an input
+    subclip_path_videos = {
+        '00:00:00-00:01:15': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-00-00-00-01-15.mp4',
+        '00:01:15-00:04:40': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-01-15-00-04-40.mp4',
+        '00:04:41-00:04:58': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4',
+        '00:04:58-00:05:45': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-58-00-05-45.mp4',
+        '00:05:45-00:06:00': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-05-45-00-06-00.mp4',
+        '00:06:00-00:06:03': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4',
+    }
+    # Example usage; output_dir keeps its default (the third positional parameter is output_dir, so the dict must not be passed)
+    try:
+        result = clip_video(video_origin_path, tts_result)
+        print("裁剪结果:")
+        print(json.dumps(result, indent=4, ensure_ascii=False))
+    except Exception as e:
+        print(f"发生错误: {e}")
diff --git a/app/services/generate_narration_script.py b/app/services/generate_narration_script.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6640dbf891332326fdc60ecd2351b948dad392e
--- /dev/null
+++ b/app/services/generate_narration_script.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : 生成介绍文案
+@Author : 小林同学
+@Date   : 2025/5/8 上午11:33 
+'''
+
+import json
+import os
+import traceback
+from openai import OpenAI
+from loguru import logger
+
+
+def parse_frame_analysis_to_markdown(json_file_path):
+    """
+    Render a video frame-analysis JSON file as Markdown.
+
+    One "## 片段 N" section is produced per entry of ``overall_activity_summaries``
+    (time range and summary), followed by the ``frame_observations`` entries that
+    share the entry's ``batch_index``.
+
+    :param json_file_path: Path of the JSON file
+    :return: The Markdown text. Problems are not raised: a missing file or any
+        processing error is returned as an error-message string instead.
+    """
+    # Guard clause: a missing input file is reported through the return value,
+    # not through an exception
+    if not os.path.exists(json_file_path):
+        return f"错误: 文件 {json_file_path} 不存在"
+
+    try:
+        # Load the analysis document
+        with open(json_file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        # Section summaries and per-frame observations (both optional)
+        summaries = data.get('overall_activity_summaries', [])
+        frame_observations = data.get('frame_observations', [])
+
+        # Group the frame observations by batch; within a batch the original
+        # order of the observations is kept
+        frames_by_batch = {}
+        for frame in frame_observations:
+            frames_by_batch.setdefault(frame.get('batch_index'), []).append(frame)
+
+        # Collect the output piece by piece and join once at the end
+        parts = []
+        for section_no, summary in enumerate(summaries, 1):
+            batch_index = summary.get('batch_index')
+            time_range = summary.get('time_range', '')
+            batch_summary = summary.get('summary', '')
+
+            parts.append(f"## 片段 {section_no}\n")
+            parts.append(f"- 时间范围：{time_range}\n")
+            # A falsy summary is rendered as an empty description
+            parts.append(f"- 片段描述：{batch_summary if batch_summary else ''}\n")
+            parts.append("- 详细描述：\n")
+
+            # Frame details of this batch
+            for frame in frames_by_batch.get(batch_index, []):
+                timestamp = frame.get('timestamp', '')
+                observation = frame.get('observation', '')
+                # The observation text is used as-is, without any splitting
+                parts.append(f"  - {timestamp}: {observation if observation else ''}\n")
+
+            # Blank line between sections
+            parts.append("\n")
+
+        return "".join(parts)
+
+    except Exception as e:
+        # Any failure (unreadable JSON, unexpected structure, ...) is reported as text
+        return f"处理JSON文件时出错: {traceback.format_exc()}"
+
+
+def generate_narration(markdown_content, api_key, base_url, model):
+    """
+    Ask an OpenAI-compatible chat model to write a narration script for a video.
+
+    The prompt contains three reference scripts in the target style plus the
+    frame analysis to narrate, and asks for a JSON answer of the form
+    ``{"items": [{"_id": ..., "timestamp": ..., "picture": ..., "narration": ...}]}``.
+
+    :param markdown_content: Frame analysis of the video, in Markdown
+    :param api_key: API key for the endpoint
+    :param base_url: Base URL of the API (for endpoints other than the official one)
+    :param model: Model name; "deepseek-reasoner" is called without JSON mode and
+        has the ```json fence markers removed from its answer
+    :return: The generated script as a string (expected to be JSON). Nothing is
+        raised on failure: an error-message string is returned instead, with
+        the traceback text when an exception occurred
+    """
+    try:
+        # Build the prompt. The template is sent verbatim, with %s replaced by
+        # markdown_content. Keep the template well-formed: every example must be
+        # closed by its matching </example_text_N> tag and the JSON sample must
+        # be balanced. The argument is passed as a 1-tuple so that a tuple
+        # value can never be unpacked by the % operator.
+        prompt = """
+我是一名荒野建造解说的博主，以下是一些同行的对标文案，请你深度学习并总结这些文案的风格特点跟内容特点：
+
+<example_text_1>
+解压助眠的天花板就是荒野建造，沉浸丝滑的搭建过程可以说每一帧都是极致享受，我保证强迫症来了都找不出一丁点毛病。更别说全屋严丝合缝的拼接工艺，还能轻松抵御零下二十度气温，让你居住的每一天都温暖如春。
+在家闲不住的西姆今天也打算来一次野外建造，行走没多久他就发现许多倒塌的树，任由它们自生自灭不如将其利用起来。想到这他就开始挥舞铲子要把地基挖掘出来，虽然每次只能挖一点点，但架不住他体能惊人。没多长时间一个 2x3 的深坑就赫然出现，这深度住他一人绰绰有余。
+随后他去附近收集来原木，这些都是搭建墙壁的最好材料。而在投入使用前自然要把表皮刮掉，防止森林中的白蚁蛀虫。处理好一大堆后西姆还在两端打孔，使用木钉固定在一起。这可不是用来做墙壁的，而是做庇护所的承重柱。只要木头间的缝隙足够紧密，那搭建出的木屋就能足够坚固。
+每向上搭建一层，他都会在中间塞入苔藓防寒，保证不会泄露一丝热量。其他几面也是用相同方法，很快西姆就做好了三面墙壁，每一根木头都极其工整，保证强迫症来了都要点个赞再走。
+在继续搭建墙壁前西姆决定将壁炉制作出来，毕竟森林夜晚的气温会很低，保暖措施可是重中之重。完成后他找来一块大树皮用来充当庇护所的大门，而上面刮掉的木屑还能作为壁炉的引火物，可以说再完美不过。
+测试了排烟没问题后他才开始搭建最后一面墙壁，这一面要预留门和窗，所以在搭建到一半后还需要在原木中间开出卡口，让自己劈砍时能轻松许多。此时只需将另外一根如法炮制，两端拼接在一起后就是一扇大小适中的窗户。而随着随后一层苔藓铺好，最后一根原木落位，这个庇护所的雏形就算完成。
+大门的安装他没选择用合页，而是在底端雕刻出榫头，门框上则雕刻出榫眼，只能说西姆的眼就是一把尺，这完全就是严丝合缝。此时他才开始搭建屋顶。这里西姆用的方法不同，他先把最外围的原木固定好，随后将原木平铺在上面，就能得到完美的斜面屋顶。等他将四周的围栏也装好后，工整的屋顶看起来十分舒服，西姆躺上去都不想动。
+稍作休息后，他利用剩余的苔藓，对屋顶的缝隙处密封。可这样西姆觉得不够保险，于是他找来一些黏土，再次对原本的缝隙二次加工，保管这庇护所冬天也暖和。最后只需要平铺上枯叶，以及挖掘出的泥土，整个屋顶就算完成。
+考虑到庇护所的美观性，自然少不了覆盖上苔藓，翠绿的颜色看起来十分舒服。就连门口的庭院旁，他都移植了许多小树做点缀，让这木屋与周边环境融为一体。西姆才刚完成好这件事，一场大雨就骤然降临。好在此时的他已经不用淋雨，更别说这屋顶防水十分不错，室内没一点雨水渗透进来。
+等待温度回升的过程，西姆利用墙壁本身的凹槽，把床框镶嵌在上面，只需要铺上苔藓，以及自带的床单枕头，一张完美的单人床就做好。辛苦劳作一整天，西姆可不会亏待自己。他将自带的牛肉腌制好后，直接放到壁炉中烤，只需要等待三十分钟，就能享受这美味的一顿。
+在辛苦建造一星期后，他终于可以在自己搭建的庇护所中，享受最纯正的野外露营。后面西姆回家补给了一堆物资，再次回来时森林已经大雪纷飞，让他原本翠绿的小屋，更换上了冬季限定皮肤。好在内部设施没受什么影响，和他离开时一样整洁。
+就是房间中已经没多少柴火，让西姆今天又得劈柴。寒冷干燥的天气，让木头劈起来十分轻松。没多久他就收集到一大堆，这些足够燃烧好几天。虽然此时外面大雪纷飞，但小屋中却开始逐渐温暖。这次他除了带来一些食物外，还有几瓶调味料，以及一整套被褥，让自己的居住舒适度提高一大截。
+而秋天他有收集干草的缘故，只需要塞入枕套中密封起来，就能作为靠垫用。就这居住条件，比一般人在家过的还要奢侈。趁着壁炉木头变木炭的过程，西姆则开始不紧不慢的处理食物。他取出一块牛排，改好花刀以后，撒上一堆调料腌制起来。接着用锡纸包裹好，放到壁炉中直接炭烤，搭配上自带的红酒，是一个非常好的选择。
+随着时间来到第二天，外面的积雪融化了不少，西姆简单做顿煎蛋补充体力后，决定制作一个室外篝火堆，用来晚上驱散周边野兽。搭建这玩意没什么技巧，只需要找到一大堆木棍，利用大树的夹缝将其掰弯，然后将其堆积在一起，就是一个简易版的篝火堆。看这外形有点像帐篷，好在西姆没想那么多。
+等待天色暗淡下来后，他才来到室外将其点燃，顺便处理下多余的废料。只可惜这场景没朋友陪在身边，对西姆来说可能是个遗憾。而哪怕森林只有他一个人，都依旧做了好几个小时。等到里面的篝火彻底燃尽后，西姆还找来雪球，覆盖到上面将火熄灭，这防火意识可谓十分好。最后在室内二十五度的高温下，裹着被子睡觉。
+</example_text_1>
+
+<example_text_2>
+解压助眠的天花板就是荒野建造，沉浸丝滑的搭建过程每一帧都是极致享受，全屋严丝合缝的拼接工艺，能轻松抵御零下二十度气温，居住体验温暖如春。
+在家闲不住的西姆开启野外建造。他发现倒塌的树，决定加以利用。先挖掘出 2x3 的深坑作为地基，接着收集原木，刮掉表皮防白蚁蛀虫，打孔用木钉固定制作承重柱。搭建墙壁时，每一层都塞入苔藓防寒，很快做好三面墙。
+为应对森林夜晚低温，西姆制作壁炉，用大树皮当大门，刮下的木屑做引火物。搭建最后一面墙时预留门窗，通过在原木中间开口拼接做出窗户。大门采用榫卯结构安装，严丝合缝。
+搭建屋顶时，先固定外围原木，再平铺原木形成斜面屋顶，之后用苔藓、黏土密封缝隙，铺上枯叶和泥土。为美观，在木屋覆盖苔藓，移植小树点缀。完工时遇大雨，木屋防水良好。
+西姆利用墙壁凹槽镶嵌床框，铺上苔藓、床单枕头做成床。劳作一天后，他用壁炉烤牛肉享用。建造一星期后，他开始野外露营。
+后来西姆回家补给物资，回来时森林大雪纷飞。他劈柴储备，带回食物、调味料和被褥，提高居住舒适度，还用干草做靠垫。他用壁炉烤牛排，搭配红酒。
+第二天，积雪融化，西姆制作室外篝火堆防野兽。用大树夹缝掰弯木棍堆积而成，晚上点燃处理废料，结束后用雪球灭火，最后在室内二十五度的环境中裹被入睡。
+</example_text_2>
+
+<example_text_3>
+如果战争到来，这个深埋地下十几米的庇护所绝对是 bug 般的存在。即使被敌人发现，还能通过快速通道一秒逃出。里面不仅有竹子、地暖、地下水井，还自制抽水机。在解决用水问题的同时，甚至自研无土栽培技术，过上完全自给自足的生活。
+阿伟的老婆美如花，但阿伟从来不回家，来到野外他乐哈哈，一言不合就开挖。众所周知当战争来临时，地下堡垒的安全性是最高的。阿伟苦苦研习两载半，只为练就一身挖洞本领。在这双逆天麒麟臂的加持下，如此坚硬的泥土都只能当做炮灰。
+得到了充足的空间后，他便开始对这些边缘进行打磨。随后阿伟将细线捆在木棍上，以此描绘出圆柱的轮廓。接着再一点点铲掉多余的部分。虽然是由泥土一体式打造，但这样的桌子保准用上千年都不成问题。
+考虑到十几米的深度进出非常不方便，于是阿伟找来两根长达 66.6 米的木头，打算为庇护所打造一条快速通道。只见他将木桩牢牢地插入地下，并顺着洞口的方向延伸出去，直到贯穿整个山洞。接着在每个木桩的连接处钉入铁钉，确保轨道不能有一毫米的偏差。完成后再制作一个木质框架，从而达到前后滑动的效果。
+不得不说阿伟这手艺简直就是大钢管子杵青蛙。在上面放上一个木制的车斗，还能加快搬运泥土的速度。没多久庇护所的内部就已经初见雏形。为了住起来更加舒适，还需要为自己打造一张床。虽然深处的泥土同样很坚固，但好处就是不用担心垮塌的风险。
+阿伟不仅设计了更加符合人体工学的拱形，并且还在一旁雕刻处壁龛。就是这氛围怎么看着有点不太吉利。别看阿伟一身腱子肉，但这身体里的艺术细菌可不少。每个边缘的地方他都做了精雕细琢，瞬间让整个卧室的颜值提升一大截。
+住在地下的好处就是房子面积全靠挖，每平方消耗两个半馒头。不仅没有了房贷的压力，就连买墓地的钱也省了。阿伟将中间的墙壁挖空，从而得到取暖的壁炉。当然最重要的还有排烟问题，要想从上往下打通十几米的山体是件极其困难的事。好在阿伟年轻时报过忆坤年的古墓派补习班，这打洞技术堪比隔壁学校的土拨鼠专业。虽然深度长达十几米，但排烟效果却一点不受影响，一个字专业！
+随后阿伟继续对壁炉底部雕刻，打通了底部放柴火的空间，并制作出放锅的灶头。完成后阿伟从侧面将壁炉打通，并制作出一条导热的通道，以此连接到床铺的位置。毕竟住在这么一个风湿宝地，不注意保暖除湿很容易得老寒腿。
+阿伟在床面上挖出一条条管道，以便于温度能传输到床的每个角落。接下来就可以根据这些通道的长度裁切出同样长短的竹子，根据竹筒的大小凿出相互连接的孔洞，最后再将竹筒内部打通，以达到温度传送的效果。
+而后阿伟将这些管道安装到凹槽内，在他严谨的制作工艺下，每根竹子刚好都能镶嵌进去。在铺设床面之前还需要用木塞把圆孔堵住，防止泥土掉落进管道。泥土虽然不能隔绝湿气，但却是十分优良的导热材料。等他把床面都压平后就可以小心的将这些木塞拔出来，最后再用黏土把剩余的管道也遮盖起来，直到整个墙面恢复原样。
+接下来还需要测试一下加热效果，当他把火点起来后，温度很快就传送到了管道内，把火力一点点加大，直到热气流淌到更远的床面。随着小孔里的青烟冒出，也预示着阿伟的地暖可以投入使用。而后阿伟制作了一些竹条，并用细绳将它们喜结连理。
+千里之行始于足下，美好的家园要靠自己双手打造。明明可以靠才艺吃饭的阿伟偏偏要用八块腹肌征服大家，就问这样的男人哪个野生婆娘不喜欢？完成后阿伟还用自己 35 码的大腚感受了一下，真烫！
+随后阿伟来到野区找到一根上好的雷击木，他当即就把木头咔嚓成两段，并取下两节较为完整的带了回去，刚好能和圆桌配套。另外一个在里面凿出凹槽，并插入木棍连接，得到一个夯土的木锤。住过农村的小伙伴都知道，这样夯出来的地面堪比水泥地，不仅坚硬耐磨，还不用担心脚底打滑。忙碌了一天的阿伟已经饥渴难耐，拿出野生小烤肠，安安心心住新房，光脚爬上大热炕，一觉能睡到天亮。
+第二天阿伟打算将房间扩宽，毕竟吃住的地方有了，还要解决个人卫生的问题。阿伟在另一侧增加了一个房间，他打算将这里打造成洗澡的地方。为了防止泥土垮塌，他将顶部做成圆弧形，等挖出足够的空间后，旁边的泥土已经堆成了小山。
+为了方便清理这些泥土，阿伟在之前的轨道增加了转弯，交接处依然是用铁钉固定，一直延伸到房间的最里面。有了运输车的帮助，这些成吨的泥土也能轻松的运送出去，并且还能体验过山车的感觉。很快他就完成了清理工作。
+为了更方便的在里面洗澡，他将底部一点点挖空，这么大的浴缸，看来阿伟并不打算一个人住。完成后他将墙面雕刻的凹凸有致，让这里看起来更加豪华。接着用洛阳铲挖出排水口，并用一根相同大小的竹筒作为开关。
+由于四周都是泥土还不能防水，阿伟特意找了一些白蚁巢，用来制作可以防水的野生水泥。现在就可以将里里外外，能接触到水的地方都涂抹一遍。细心的阿伟还找来这种 500 克一斤的鹅卵石，对池子表面进行装饰。
+没错，水源问题阿伟早已经考虑在内，他打算直接在旁边挖个水井，毕竟已经挖了这么深，再向下挖一挖，应该就能到达地下水的深度。经过几日的奋战，能看得出阿伟已经消瘦了不少，但一想到马上就能拥有的豪宅，他直接化身为无情的挖土机器，很快就挖到了好几米的深度。
+考虑到自己的弹跳力有限，阿伟在一旁定入木桩，然后通过绳子爬上爬下。随着深度越来越深，井底已经开始渗出水来，这也预示着打井成功。没多久这里面将渗满泉水，仅凭一次就能挖到水源，看来这里还真是块风湿宝地。
+随后阿伟在井口四周挖出凹槽，以便于井盖的安置。这一量才知道，井的深度已经达到了足足的 5 米。阿伟把木板组合在一起，再沿着标记切掉多余部分，他甚至还给井盖做了把手。可是如何从这么深的井里打水还是个问题，但从阿伟坚定的眼神来看，他应该想到了解决办法。
+只见他将树桩锯成两半，然后用凿子把里面一点点掏空，另外一半也是如法炮制。接着还要在底部挖出圆孔，要想成功将水从 5 米深的地方抽上来，那就不得不提到大家熟知的勾股定理。没错，这跟勾股定理没什么关系。
+阿伟给竹筒做了一个木塞，并在里面打上安装连接轴的孔。为了增加密闭性，阿伟不得不牺牲了自己的 AJ，剪出与木塞相同的大小后，再用木钉固定住。随后他收集了一些树胶，并放到火上加热融化。接下来就可以涂在木塞上增加使用寿命。
+现在将竹筒组装完成，就可以利用虹吸原理将水抽上来。完成后就可以把井盖盖上去，再用泥土在上面覆盖，现在就不用担心失足掉下去了。
+接下来阿伟去采集了一些大漆，将它涂抹在木桶接缝处，就能将其二合为一。完了再接入旁边浴缸的入水口，每个连接的地方都要做好密封，不然后面很容易漏水。随后就可以安装上活塞，并用一根木桩作为省力杠杆，根据空气压强的原理将井水抽上来。
+经过半小时的来回拉扯，硕大的浴缸终于被灌满，阿伟也是忍不住洗了把脸。接下来还需要解决排水的问题，阿伟在地上挖出沟渠，一直贯穿到屋外，然后再用竹筒从出水口连接，每个接口处都要抹上胶水，就连门外的出水口他都做了隐藏。
+在野外最重要的就是庇护所、水源还有食物。既然已经完成了前二者，那么阿伟还需要拥有可持续发展的食物来源。他先是在地上挖了两排地洞，然后在每根竹筒的表面都打上无数孔洞，这就是他打算用来种植的载体。在此之前，还需要用大火对竹筒进行杀菌消毒。
+趁着这时候，他去搬了一麻袋的木屑，先用芭蕉叶覆盖在上面，再铺上厚厚的黏土隔绝温度。在火焰的温度下，能让里面的木屑达到生长条件。
+等到第二天所有材料都晾凉后，阿伟才将竹筒内部掏空，并将木屑一点点地塞入竹筒。一切准备就绪，就可以将竹筒插入提前挖好的地洞。最后再往竹筒里塞入种子，依靠房间内的湿度和温度，就能达到大棚种植的效果。稍加时日，这些种子就会慢慢发芽。
+虽然暂时还吃不上自己培养的食物，但好在阿伟从表哥贺强那里学到不少钓鱼本领，哪怕只有一根小小的竹竿，也能让他钓上两斤半的大鲶鱼。新鲜的食材，那肯定是少不了高温消毒的过程。趁着鱼没熟，阿伟直接爬进浴缸，冰凉的井水瞬间洗去了身上的疲惫。这一刻的阿伟是无比的享受。
+不久后鱼也烤得差不多了，阿伟的生活现在可以说是有滋有味。住在十几米的地下，不仅能安全感满满，哪怕遇到危险，还能通过轨道快速逃生。
+</example_text_3>
+
+<video_frame_description>
+%s
+</video_frame_description>
+
+我正在尝试做这个内容的解说纪录片视频，我需要你以 <video_frame_description> </video_frame_description> 中的内容为解说目标，根据我刚才提供给你的对标文案 <example_text> 特点，以及你总结的特点，帮我生成一段关于荒野建造的解说文案，文案需要符合平台受欢迎的解说风格，请使用 json 格式进行输出；使用 <output> 中的输出格式：
+
+<output>
+{
+  "items": [
+    {
+        "_id": 1, # 唯一递增id
+        "timestamp": "00:00:05,390-00:00:10,430",
+        "picture": "画面描述",
+        "narration": "解说文案",
+    }
+  ]
+}
+</output>
+
+<restriction>
+1. 只输出 json 内容，不要输出其他任何说明性的文字
+2. 解说文案的语言使用 简体中文
+3. 严禁虚构画面，所有画面只能从 <video_frame_description> 中摘取
+</restriction>
+""" % (markdown_content,)
+
+        # Create the client through the OpenAI SDK
+        client = OpenAI(
+            api_key=api_key,
+            base_url=base_url
+        )
+
+        # deepseek-reasoner does not support JSON output, so response_format is
+        # only sent to the other models
+        supports_json_mode = model not in ["deepseek-reasoner"]
+        request_kwargs = {
+            "model": model,
+            "messages": [
+                {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
+                {"role": "user", "content": prompt}
+            ],
+            "temperature": 1.5,
+        }
+        if supports_json_mode:
+            request_kwargs["response_format"] = {"type": "json_object"}
+
+        response = client.chat.completions.create(**request_kwargs)
+        if not response.choices:
+            return "生成解说文案失败: 未获取到有效响应"
+
+        narration_script = response.choices[0].message.content
+        if supports_json_mode:
+            logger.debug(f"消耗的tokens: {response.usage.total_tokens}")
+            return narration_script
+
+        # Without JSON mode the answer may come wrapped in a ```json fence;
+        # drop the fence markers so only the JSON text is returned
+        logger.debug(f"文案消耗的tokens: {response.usage.total_tokens}")
+        return narration_script.replace("```json", "").replace("```", "")
+
+    except Exception:
+        # Failures are reported through the return value, never raised
+        return f"调用API生成解说文案时出错: {traceback.format_exc()}"
+
+
+if __name__ == '__main__':
+    text_provider = 'openai'
+    text_api_key = "sk-xxx"
+    text_model = "deepseek-reasoner"
+    text_base_url = "https://api.deepseek.com"
+    video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json"
+
+    # Convert a newer frame-analysis JSON file to Markdown
+    test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_2258.json"
+    markdown_output = parse_frame_analysis_to_markdown(test_file_path)
+    # print(markdown_output)
+    
+    # Write the Markdown to a file so its formatting can be inspected
+    output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/家里家外1-5.md"
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(markdown_output)
+    # print(f"\n已将Markdown输出保存到: {output_file}")
+    
+    # # Generate the narration script (disabled)
+    # narration = generate_narration(
+    #     markdown_output,
+    #     text_api_key,
+    #     base_url=text_base_url,
+    #     model=text_model
+    # )
+    #
+    # # Save the narration script (disabled)
+    # print(narration)
+    # print(type(narration))
+    # narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
+    # with open(narration_file, 'w', encoding='utf-8') as f:
+    #     f.write(narration)
+    # print(f"\n已将解说文案保存到: {narration_file}")
diff --git a/app/services/generate_video.py b/app/services/generate_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..f125c0564559b582dc249134da3eda3626d4735a
--- /dev/null
+++ b/app/services/generate_video.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : generate_video
+@Author : 小林同学
+@Date   : 2025/5/7 上午11:55 
+'''
+
+import os
+import traceback
+from typing import Optional, Dict, Any
+from loguru import logger
+from moviepy import (
+    VideoFileClip,
+    AudioFileClip,
+    CompositeAudioClip,
+    CompositeVideoClip,
+    TextClip,
+    afx
+)
+from moviepy.video.tools.subtitles import SubtitlesClip
+from PIL import ImageFont
+
+from app.utils import utils
+
+
+def merge_materials(
+    video_path: str,
+    audio_path: str,
+    output_path: str,
+    subtitle_path: Optional[str] = None,
+    bgm_path: Optional[str] = None,
+    options: Optional[Dict[str, Any]] = None
+) -> str:
+    """
+    合并视频、音频、BGM和字幕素材生成最终视频
+    
+    参数:
+        video_path: 视频文件路径
+        audio_path: 音频文件路径
+        output_path: 输出文件路径
+        subtitle_path: 字幕文件路径，可选
+        bgm_path: 背景音乐文件路径，可选
+        options: 其他选项配置，可包含以下字段:
+            - voice_volume: 人声音量，默认1.0
+            - bgm_volume: 背景音乐音量，默认0.3
+            - original_audio_volume: 原始音频音量，默认0.0
+            - keep_original_audio: 是否保留原始音频，默认False
+            - subtitle_font: 字幕字体，默认None，系统会使用默认字体
+            - subtitle_font_size: 字幕字体大小，默认40
+            - subtitle_color: 字幕颜色，默认白色
+            - subtitle_bg_color: 字幕背景颜色，默认透明
+            - subtitle_position: 字幕位置，可选值'bottom', 'top', 'center'，默认'bottom'
+            - custom_position: 自定义位置
+            - stroke_color: 描边颜色，默认黑色
+            - stroke_width: 描边宽度，默认1
+            - threads: 处理线程数，默认2
+            - fps: 输出帧率，默认30
+            
+    返回:
+        输出视频的路径
+    """
+    # 合并选项默认值
+    if options is None:
+        options = {}
+    
+    # 设置默认参数值
+    voice_volume = options.get('voice_volume', 1.0)
+    bgm_volume = options.get('bgm_volume', 0.3)
+    original_audio_volume = options.get('original_audio_volume', 0.0)  # 默认为0，即不保留原声
+    keep_original_audio = options.get('keep_original_audio', False)  # 是否保留原声
+    subtitle_font = options.get('subtitle_font', '')
+    subtitle_font_size = options.get('subtitle_font_size', 40)
+    subtitle_color = options.get('subtitle_color', '#FFFFFF')
+    subtitle_bg_color = options.get('subtitle_bg_color', 'transparent')
+    subtitle_position = options.get('subtitle_position', 'bottom')
+    custom_position = options.get('custom_position', 70)
+    stroke_color = options.get('stroke_color', '#000000')
+    stroke_width = options.get('stroke_width', 1)
+    threads = options.get('threads', 2)
+    fps = options.get('fps', 30)
+    
+    # 处理透明背景色问题 - MoviePy 2.1.1不支持'transparent'值
+    if subtitle_bg_color == 'transparent':
+        subtitle_bg_color = None  # None在新版MoviePy中表示透明背景
+    
+    # 创建输出目录（如果不存在）
+    output_dir = os.path.dirname(output_path)
+    os.makedirs(output_dir, exist_ok=True)
+    
+    logger.info(f"开始合并素材...")
+    logger.info(f"  ① 视频: {video_path}")
+    logger.info(f"  ② 音频: {audio_path}")
+    if subtitle_path:
+        logger.info(f"  ③ 字幕: {subtitle_path}")
+    if bgm_path:
+        logger.info(f"  ④ 背景音乐: {bgm_path}")
+    logger.info(f"  ⑤ 输出: {output_path}")
+    
+    # 加载视频
+    try:
+        video_clip = VideoFileClip(video_path)
+        logger.info(f"视频尺寸: {video_clip.size[0]}x{video_clip.size[1]}, 时长: {video_clip.duration}秒")
+        
+        # 提取视频原声(如果需要)
+        original_audio = None
+        if keep_original_audio and original_audio_volume > 0:
+            try:
+                original_audio = video_clip.audio
+                if original_audio:
+                    original_audio = original_audio.with_effects([afx.MultiplyVolume(original_audio_volume)])
+                    logger.info(f"已提取视频原声，音量设置为: {original_audio_volume}")
+                else:
+                    logger.warning("视频没有音轨，无法提取原声")
+            except Exception as e:
+                logger.error(f"提取视频原声失败: {str(e)}")
+                original_audio = None
+        
+        # 移除原始音轨，稍后会合并新的音频
+        video_clip = video_clip.without_audio()
+        
+    except Exception as e:
+        logger.error(f"加载视频失败: {str(e)}")
+        raise
+    
+    # 处理背景音乐和所有音频轨道合成
+    audio_tracks = []
+
+    # 先添加主音频（配音）
+    if audio_path and os.path.exists(audio_path):
+        try:
+            voice_audio = AudioFileClip(audio_path).with_effects([afx.MultiplyVolume(voice_volume)])
+            audio_tracks.append(voice_audio)
+            logger.info(f"已添加配音音频，音量: {voice_volume}")
+        except Exception as e:
+            logger.error(f"加载配音音频失败: {str(e)}")
+
+    # 添加原声（如果需要）
+    if original_audio is not None:
+        audio_tracks.append(original_audio)
+        logger.info(f"已添加视频原声，音量: {original_audio_volume}")
+
+    # 添加背景音乐（如果有）
+    if bgm_path and os.path.exists(bgm_path):
+        try:
+            bgm_clip = AudioFileClip(bgm_path).with_effects([
+                afx.MultiplyVolume(bgm_volume),
+                afx.AudioFadeOut(3),
+                afx.AudioLoop(duration=video_clip.duration),
+            ])
+            audio_tracks.append(bgm_clip)
+            logger.info(f"已添加背景音乐，音量: {bgm_volume}")
+        except Exception as e:
+            logger.error(f"添加背景音乐失败: \n{traceback.format_exc()}")
+
+    # 合成最终的音频轨道
+    if audio_tracks:
+        final_audio = CompositeAudioClip(audio_tracks)
+        video_clip = video_clip.with_audio(final_audio)
+        logger.info(f"已合成所有音频轨道，共{len(audio_tracks)}个")
+    else:
+        logger.warning("没有可用的音频轨道，输出视频将没有声音")
+    
+    # 处理字体路径
+    font_path = None
+    if subtitle_path and subtitle_font:
+        font_path = os.path.join(utils.font_dir(), subtitle_font)
+        if os.name == "nt":
+            font_path = font_path.replace("\\", "/")
+        logger.info(f"使用字体: {font_path}")
+    
+    # 处理视频尺寸
+    video_width, video_height = video_clip.size
+    
+    # 字幕处理函数
+    def create_text_clip(subtitle_item):
+        """创建单个字幕片段"""
+        phrase = subtitle_item[1]
+        max_width = video_width * 0.9
+        
+        # 如果有字体路径，进行文本换行处理
+        wrapped_txt = phrase
+        txt_height = 0
+        if font_path:
+            wrapped_txt, txt_height = wrap_text(
+                phrase, 
+                max_width=max_width, 
+                font=font_path, 
+                fontsize=subtitle_font_size
+            )
+        
+        # 创建文本片段
+        try:
+            _clip = TextClip(
+                text=wrapped_txt,
+                font=font_path,
+                font_size=subtitle_font_size,
+                color=subtitle_color,
+                bg_color=subtitle_bg_color,  # 这里已经在前面处理过，None表示透明
+                stroke_color=stroke_color,
+                stroke_width=stroke_width,
+            )
+        except Exception as e:
+            logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试")
+            # 如果上面的方法失败，尝试使用更简单的参数
+            _clip = TextClip(
+                text=wrapped_txt,
+                font=font_path,
+                font_size=subtitle_font_size,
+                color=subtitle_color,
+            )
+        
+        # 设置字幕时间
+        duration = subtitle_item[0][1] - subtitle_item[0][0]
+        _clip = _clip.with_start(subtitle_item[0][0])
+        _clip = _clip.with_end(subtitle_item[0][1])
+        _clip = _clip.with_duration(duration)
+        
+        # 设置字幕位置
+        if subtitle_position == "bottom":
+            _clip = _clip.with_position(("center", video_height * 0.95 - _clip.h))
+        elif subtitle_position == "top":
+            _clip = _clip.with_position(("center", video_height * 0.05))
+        elif subtitle_position == "custom":
+            margin = 10
+            max_y = video_height - _clip.h - margin
+            min_y = margin
+            custom_y = (video_height - _clip.h) * (custom_position / 100)
+            custom_y = max(
+                min_y, min(custom_y, max_y)
+            )
+            _clip = _clip.with_position(("center", custom_y))
+        else:  # center
+            _clip = _clip.with_position(("center", "center"))
+            
+        return _clip
+        
+    # 创建TextClip工厂函数
+    def make_textclip(text):
+        return TextClip(
+            text=text,
+            font=font_path,
+            font_size=subtitle_font_size,
+            color=subtitle_color,
+        )
+    
+    # 处理字幕
+    if subtitle_path and os.path.exists(subtitle_path):
+        try:
+            # 加载字幕文件
+            sub = SubtitlesClip(
+                subtitles=subtitle_path, 
+                encoding="utf-8", 
+                make_textclip=make_textclip
+            )
+            
+            # 创建每个字幕片段
+            text_clips = []
+            for item in sub.subtitles:
+                clip = create_text_clip(subtitle_item=item)
+                text_clips.append(clip)
+                
+            # 合成视频和字幕
+            video_clip = CompositeVideoClip([video_clip, *text_clips])
+            logger.info(f"已添加{len(text_clips)}个字幕片段")
+        except Exception as e:
+            logger.error(f"处理字幕失败: \n{traceback.format_exc()}")
+    
+    # 导出最终视频
+    try:
+        video_clip.write_videofile(
+            output_path,
+            audio_codec="aac",
+            temp_audiofile_path=output_dir,
+            threads=threads,
+            fps=fps,
+        )
+        logger.success(f"素材合并完成: {output_path}")
+    except Exception as e:
+        logger.error(f"导出视频失败: {str(e)}")
+        raise
+    finally:
+        # 释放资源
+        video_clip.close()
+        del video_clip
+    
+    return output_path
+
+
+def wrap_text(text, max_width, font="Arial", fontsize=60):
+    """
+    文本换行函数，使长文本适应指定宽度
+    
+    参数:
+        text: 需要换行的文本
+        max_width: 最大宽度（像素）
+        font: 字体路径
+        fontsize: 字体大小
+        
+    返回:
+        换行后的文本和文本高度
+    """
+    # 创建ImageFont对象
+    try:
+        font_obj = ImageFont.truetype(font, fontsize)
+    except:
+        # 如果无法加载指定字体，使用默认字体
+        font_obj = ImageFont.load_default()
+    
+    def get_text_size(inner_text):
+        inner_text = inner_text.strip()
+        left, top, right, bottom = font_obj.getbbox(inner_text)
+        return right - left, bottom - top
+
+    width, height = get_text_size(text)
+    if width <= max_width:
+        return text, height
+
+    processed = True
+
+    _wrapped_lines_ = []
+    words = text.split(" ")
+    _txt_ = ""
+    for word in words:
+        _before = _txt_
+        _txt_ += f"{word} "
+        _width, _height = get_text_size(_txt_)
+        if _width <= max_width:
+            continue
+        else:
+            if _txt_.strip() == word.strip():
+                processed = False
+                break
+            _wrapped_lines_.append(_before)
+            _txt_ = f"{word} "
+    _wrapped_lines_.append(_txt_)
+    if processed:
+        _wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
+        result = "\n".join(_wrapped_lines_).strip()
+        height = len(_wrapped_lines_) * height
+        return result, height
+
+    _wrapped_lines_ = []
+    chars = list(text)
+    _txt_ = ""
+    for word in chars:
+        _txt_ += word
+        _width, _height = get_text_size(_txt_)
+        if _width <= max_width:
+            continue
+        else:
+            _wrapped_lines_.append(_txt_)
+            _txt_ = ""
+    _wrapped_lines_.append(_txt_)
+    result = "\n".join(_wrapped_lines_).strip()
+    height = len(_wrapped_lines_) * height
+    return result, height
+
+
+if __name__ == '__main__':  # manual run against hard-coded local sample files
+    merger_mp4 = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger.mp4'
+    merger_sub = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merged_subtitle_00_00_00-00_01_30.srt'
+    merger_audio = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger_audio.mp3'
+    bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
+    output_video = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/combined_test.mp4'
+    
+    # Example call
+    options = {
+        'voice_volume': 1.0,            # voice-over volume
+        'bgm_volume': 0.1,              # background-music volume
+        'original_audio_volume': 1.0,   # volume of the video's own audio; 0 means it is dropped
+        'keep_original_audio': True,    # whether to keep the video's own audio
+        'subtitle_font': 'MicrosoftYaHeiNormal.ttc',  # font file name; merge_materials joins it with utils.font_dir()
+        'subtitle_font_size': 40,
+        'subtitle_color': '#FFFFFF',
+        'subtitle_bg_color': None,      # None is passed directly to mean a transparent background
+        'subtitle_position': 'bottom',
+        'threads': 2
+    }
+    
+    try:
+        merge_materials(
+            video_path=merger_mp4,
+            audio_path=merger_audio,
+            subtitle_path=merger_sub,
+            bgm_path=bgm_path,
+            output_path=output_video,
+            options=options
+        )
+    except Exception as e:
+        logger.error(f"合并素材失败: \n{traceback.format_exc()}")
diff --git a/app/services/llm.py b/app/services/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..0db792087a569f01d9667ebba4eebac3830d6aaf
--- /dev/null
+++ b/app/services/llm.py
@@ -0,0 +1,808 @@
+import os
+import re
+import json
+import traceback
+import streamlit as st
+from typing import List
+from loguru import logger
+from openai import OpenAI
+from openai import AzureOpenAI
+from moviepy import VideoFileClip
+from openai.types.chat import ChatCompletion
+import google.generativeai as gemini
+from googleapiclient.errors import ResumableUploadError
+from google.api_core.exceptions import *
+from google.generativeai.types import *
+import subprocess
+from typing import Union, TextIO
+
+from app.config import config
+from app.utils.utils import clean_model_output
+
+_max_retries = 5
+
+Method = """
+重要提示：每一部剧的文案，前几句必须吸引人
+首先我们在看完看懂电影后，大脑里面要先有一个大概的轮廓，也就是一个类似于作文的大纲，电影主题线在哪里，首先要找到。
+一般将文案分为开头、内容、结尾
+## 开头部分
+文案开头三句话，是留住用户的关键！
+
+### 方式一：开头概括总结
+文案的前三句，是整部电影的概括总结，2-3句介绍后，开始叙述故事剧情！
+推荐新手（新号）做：（盘点型）
+盘点全球最恐怖的10部电影
+盘点全球最科幻的10部电影
+盘点全球最悲惨的10部电影
+盘点全球最值得看的10部灾难电影
+盘点全球最值得看的10部励志电影
+
+下面的示例就是最简单的解说文案开头：
+1.这是XXX国20年来最大尺度的一部剧，极度烧脑，却让99%的人看得心潮澎湃、无法自拔，故事开始……
+2.这是有史以来电影院唯一一部全程开灯放完的电影，期间无数人尖叫昏厥，他被成为勇敢者的专属，因为99%的人都不敢看到结局，许多人看完它从此不愿再碰手机，他就是大名鼎鼎的暗黑神作《XXX》……
+3.这到底是一部什么样的电影，能被55个国家公开抵制，它甚至为了上映，不惜删减掉整整47分钟的剧情……
+4.是什么样的一个人被豆瓣网友称之为史上最牛P的老太太，都70岁了还要去贩毒……
+5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/……
+6.这到底是一部什么样的影片，他一个人就拿了4个顶级奖项，第一季8.7分，第二季直接干到9.5分，11万人给出5星好评，一共也就6集，却斩获26项国际大奖，看过的人都说，他是近年来最好的xxx剧，几乎成为了近年来xxx剧的标杆。故事发生在……
+7.他是国产电影的巅峰佳作，更是许多80-90后的青春启蒙，曾入选《时代》周刊，获得年度佳片第一，可在国内却被尘封多年，至今为止都无法在各大视频网站看到完整资源，他就是《xxxxxx》
+8.这是一部让所有人看得荷尔蒙飙升的爽片……
+9.他被成为世界上最虐心绝望的电影，至今无人敢看第二遍，很难想象，他是根据真实事件改编而来……
+10.这大概是有史以来最令人不寒而栗的电影，当年一经放映，就点燃了无数人的怒火，不少观众不等影片放完，就愤然离场，它比《xxx》更让人绝望，比比《xxx》更让人xxx，能坚持看完全片的人，更是万中无一，包括我。甚至观影结束后，有无数人抵制投诉这部电影，认为影片的导演玩弄了他们的情感！他是顶级神作《xxxx》……
+11.这是X国有史以来最高赞的一部悬疑电影，然而却因为某些原因，国内90%的人，没能看过这部片子，他就是《xxx》……
+12.有这样一部电影，这辈子，你绝对不想再看第二遍，并不是它剧情烂俗，而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场，更让许多同行都不想解说这部电影，他就是大名鼎鼎的暗黑神作《xxx》…
+13.它被誉为史上最牛悬疑片无数人在看完它时候，一个月不敢照镜子，这样一部仅适合部分年龄段观看的影片，究竟有什么样的魅力，竟然获得某瓣8.2的高分，很多人说这部电影到处都是看点，他就是《xxx》….
+14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影，能够在某瓣上被70万人打出9.3分的高分……
+15.这是一部细思极恐的科幻大片，整部电影颠覆你的三观，它的名字叫……
+16.史上最震撼的灾难片，每一点都不舍得快进的电影，他叫……
+17.今天给大家带来一部基于真实事件改编的（主题介绍一句……）的故事片，这是一部连环悬疑剧，如果不看到最后绝对想不到结局竟然是这样的反转……
+
+### 方式二：情景式、假设性开头
+1.他叫……你以为他是……的吗？不。他是来……然后开始叙述
+2.你知道……吗？原来……然后开始叙述
+3.如果给你….，你会怎么样？
+4.如果你是….，你会怎么样？
+
+### 方式三：以国家为开头！简单明了。话语不需要多，但是需要讲解透彻！
+1.这是一部韩国最新灾难片，你一定没有看过……
+2.这是一部印度高分悬疑片，
+3.这部电影原在日本因为……而被下架，
+4.这是韩国最恐怖的犯罪片，
+5.这是最近国产片评分最高的悬疑片
+以上均按照影片国家来区分，然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法！
+
+### 方式四：如何自由发挥
+正常情况下，每一部电影都有非常关键的一个大纲，这部电影的主题其实是可以用一句话、两句话概括的。只要看懂电影，就能找到这个主题大纲。
+我们提前把这个主题大纲给放到影视最前面，作为我们的前三句的文案，将会非常吸引人！
+
+例如：
+1.这不是电影，这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂，故事就是从这里开始！ 
+2.如果你男朋友出轨了，他不爱你了，还你家暴，怎么办？接下来这部电影就会教你如何让老公服服帖帖的呆在你身边！女主是一个……开始叙述了。 
+3.他力大无穷，双眼放光，这不是拯救地球的超人吗？然而不是。今天给大家推荐的这部电影叫……
+
+以上是需要看完影片，看懂影片，然后从里面提炼出精彩的几句话,当然是比较难的，当你不会自己去总结前三句的经典的话。可以用前面方式一二三！
+实在想不出来如何去提炼，可以去搜索这部剧，对这部电影的影评，也会给你带过来很多灵感的！
+
+
+## 内容部分
+开头有了，剩下的就是开始叙述正文了。主题介绍是根据影片内容来介绍，如果实在自己想不出来。可以参考其他平台中对这部电影的精彩介绍，提取2-3句也可以！
+正常情况下，我们叙述的时候其实是非常简单的，把整部电影主题线，叙述下来，其实文案就是加些修饰词把电影重点内容叙述下来。加上一些修饰词。
+
+以悬疑剧为例：
+竟然，突然，原来，但是，但，可是，结果，直到，如果，而，果然，发现，只是，出奇，之后，没错，不止，更是，当然，因为，所以……等！
+以上是比较常用的，当然还有很多，需要靠平时思考和阅读的积累！因悬疑剧会有多处反转剧情。所以需要用到反转的修饰词比较多，只有用到这些词。才能体现出各种反转剧情！
+建议大家在刚开始做的时候，做8分钟内的，不要太长，分成三段。每段也是不超过三分钟，这样时间刚好。可以比较好的完成完播率！
+
+
+## 结尾部分
+最后故事的结局，除了反转，可以来点人生的道理！如果刚开始不会，可以不写。
+后面水平越来越高的时候，可以进行人生道理的讲评。
+
+比如：这部电影告诉我们……
+类似于哲理性质的作为一个总结！
+也可以把最后的影视反转，原生放出来，留下悬念。
+
+比如：也可以总结下这部短片如何的好，推荐/值得大家去观看之类的话语。
+其实就是给我们的作品来一个总结，总结我们所做的三个视频，有开始就要有结束。这个结束不一定是固定的模版。但是视频一定要有结尾。让人感觉有头有尾才最舒服！
+做解说第一次，可能会做两天。第二次可能就需要一天了。慢慢的。时间缩短到8个小时之内是我们平的制作全部时间！
+
+"""
+
+
+def handle_exception(err):
+    if isinstance(err, PermissionDenied):
+        raise Exception("403 用户没有权限访问该资源")
+    elif isinstance(err, ResourceExhausted):
+        raise Exception("429 您的配额已用尽。请稍后重试。请考虑设置自动重试来处理这些错误")
+    elif isinstance(err, InvalidArgument):
+        raise Exception("400 参数无效。例如，文件过大，超出了载荷大小限制。另一个事件提供了无效的 API 密钥。")
+    elif isinstance(err, AlreadyExists):
+        raise Exception("409 已存在具有相同 ID 的已调参模型。对新模型进行调参时，请指定唯一的模型 ID。")
+    elif isinstance(err, RetryError):
+        raise Exception("使用不支持 gRPC 的代理时可能会引起此错误。请尝试将 REST 传输与 genai.configure(..., transport=rest) 搭配使用。")
+    elif isinstance(err, BlockedPromptException):
+        raise Exception("400 出于安全原因，该提示已被屏蔽。")
+    elif isinstance(err, BrokenResponseError):
+        raise Exception("500 流式传输响应已损坏。在访问需要完整响应的内容（例如聊天记录）时引发。查看堆栈轨迹中提供的错误详情。")
+    elif isinstance(err, IncompleteIterationError):
+        raise Exception("500 访问需要完整 API 响应但流式响应尚未完全迭代的内容时引发。对响应对象调用 resolve() 以使用迭代器。")
+    elif isinstance(err, ConnectionError):
+        raise Exception("网络连接错误, 请检查您的网络连接(建议使用 NarratoAI 官方提供的 url)")
+    else:
+        raise Exception(f"大模型请求失败, 下面是具体报错信息: \n\n{traceback.format_exc()}")
+
+
+def _generate_response(prompt: str, llm_provider: str = None) -> str:
+    """
+    Send a single-turn prompt to the selected LLM provider and return the reply text.
+        prompt: the user prompt text
+        llm_provider: provider name; when falsy, config.app "llm_provider" is used (default "openai")
+    """
+    content = ""
+    if not llm_provider:
+        llm_provider = config.app.get("llm_provider", "openai")
+    logger.info(f"llm provider: {llm_provider}")
+    if llm_provider == "g4f":
+        model_name = config.app.get("g4f_model_name", "")
+        if not model_name:
+            model_name = "gpt-3.5-turbo-16k-0613"
+        import g4f
+
+        content = g4f.ChatCompletion.create(
+            model=model_name,
+            messages=[{"role": "user", "content": prompt}],
+        )
+    else:
+        api_version = ""  # for azure
+        if llm_provider == "moonshot":
+            api_key = config.app.get("moonshot_api_key")
+            model_name = config.app.get("moonshot_model_name")
+            base_url = "https://api.moonshot.cn/v1"
+        elif llm_provider == "ollama":
+            # api_key = config.app.get("openai_api_key")
+            api_key = "ollama"  # any string works but you are required to have one
+            model_name = config.app.get("ollama_model_name")
+            base_url = config.app.get("ollama_base_url", "")
+            if not base_url:
+                base_url = "http://localhost:11434/v1"
+        elif llm_provider == "openai":
+            api_key = config.app.get("openai_api_key")
+            model_name = config.app.get("openai_model_name")
+            base_url = config.app.get("openai_base_url", "")
+            if not base_url:
+                base_url = "https://api.openai.com/v1"
+        elif llm_provider == "oneapi":
+            api_key = config.app.get("oneapi_api_key")
+            model_name = config.app.get("oneapi_model_name")
+            base_url = config.app.get("oneapi_base_url", "")
+        elif llm_provider == "azure":
+            api_key = config.app.get("azure_api_key")
+            model_name = config.app.get("azure_model_name")
+            base_url = config.app.get("azure_base_url", "")
+            api_version = config.app.get("azure_api_version", "2024-02-15-preview")
+        elif llm_provider == "gemini":
+            api_key = config.app.get("gemini_api_key")
+            model_name = config.app.get("gemini_model_name")
+            base_url = "***"  # placeholder: only needs to pass the non-empty check below; the gemini branch never reads it
+        elif llm_provider == "qwen":
+            api_key = config.app.get("qwen_api_key")
+            model_name = config.app.get("qwen_model_name")
+            base_url = "***"  # placeholder, see gemini above
+        elif llm_provider == "cloudflare":
+            api_key = config.app.get("cloudflare_api_key")
+            model_name = config.app.get("cloudflare_model_name")
+            account_id = config.app.get("cloudflare_account_id")
+            base_url = "***"  # placeholder; the real URL is built from account_id in the cloudflare branch
+        elif llm_provider == "deepseek":
+            api_key = config.app.get("deepseek_api_key")
+            model_name = config.app.get("deepseek_model_name")
+            base_url = config.app.get("deepseek_base_url")
+            if not base_url:
+                base_url = "https://api.deepseek.com"
+        elif llm_provider == "ernie":
+            api_key = config.app.get("ernie_api_key")
+            secret_key = config.app.get("ernie_secret_key")
+            base_url = config.app.get("ernie_base_url")
+            model_name = "***"  # placeholder; the ernie request below does not send a model name
+            if not secret_key:
+                raise ValueError(
+                    f"{llm_provider}: secret_key is not set, please set it in the config.toml file."
+                )
+        else:
+            raise ValueError(
+                "llm_provider is not set, please set it in the config.toml file."
+            )
+
+        if not api_key:
+            raise ValueError(
+                f"{llm_provider}: api_key is not set, please set it in the config.toml file."
+            )
+        if not model_name:
+            raise ValueError(
+                f"{llm_provider}: model_name is not set, please set it in the config.toml file."
+            )
+        if not base_url:
+            raise ValueError(
+                f"{llm_provider}: base_url is not set, please set it in the config.toml file."
+            )
+
+        if llm_provider == "qwen":
+            import dashscope
+            from dashscope.api_entities.dashscope_response import GenerationResponse
+
+            dashscope.api_key = api_key
+            response = dashscope.Generation.call(
+                model=model_name, messages=[{"role": "user", "content": prompt}]
+            )
+            if response:
+                if isinstance(response, GenerationResponse):
+                    status_code = response.status_code
+                    if status_code != 200:
+                        raise Exception(
+                            f'[{llm_provider}] returned an error response: "{response}"'
+                        )
+
+                    content = response["output"]["text"]
+                    return content.replace("\n", "")
+                else:
+                    raise Exception(
+                        f'[{llm_provider}] returned an invalid response: "{response}"'
+                    )
+            else:
+                raise Exception(f"[{llm_provider}] returned an empty response")
+
+        if llm_provider == "gemini":
+            import google.generativeai as genai
+
+            genai.configure(api_key=api_key, transport="rest")
+
+            safety_settings = {
+                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+            }
+
+            model = genai.GenerativeModel(
+                model_name=model_name,
+                safety_settings=safety_settings,
+            )
+
+            try:
+                response = model.generate_content(prompt)
+                return response.text  # returned as-is: newlines are not stripped on this path
+            except Exception as err:
+                return handle_exception(err)
+
+        if llm_provider == "cloudflare":
+            import requests
+
+            response = requests.post(
+                f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}",
+                headers={"Authorization": f"Bearer {api_key}"},
+                json={
+                    "messages": [
+                        {"role": "system", "content": "You are a friendly assistant"},
+                        {"role": "user", "content": prompt},
+                    ]
+                },
+            )
+            result = response.json()
+            logger.info(result)
+            return result["result"]["response"]  # returned as-is: newlines are not stripped on this path
+
+        if llm_provider == "ernie":
+            import requests
+
+            params = {
+                "grant_type": "client_credentials",
+                "client_id": api_key,
+                "client_secret": secret_key,
+            }
+            access_token = (
+                requests.post("https://aip.baidubce.com/oauth/2.0/token", params=params)
+                .json()
+                .get("access_token")
+            )
+            url = f"{base_url}?access_token={access_token}"
+
+            payload = json.dumps(
+                {
+                    "messages": [{"role": "user", "content": prompt}],
+                    "temperature": 0.5,
+                    "top_p": 0.8,
+                    "penalty_score": 1,
+                    "disable_search": False,
+                    "enable_citation": False,
+                    "response_format": "text",
+                }
+            )
+            headers = {"Content-Type": "application/json"}
+
+            response = requests.request(
+                "POST", url, headers=headers, data=payload
+            ).json()
+            return response.get("result")  # None when the reply has no "result" key
+
+        if llm_provider == "azure":
+            client = AzureOpenAI(
+                api_key=api_key,
+                api_version=api_version,
+                azure_endpoint=base_url,
+            )
+        else:
+            client = OpenAI(
+                api_key=api_key,
+                base_url=base_url,
+            )
+
+        response = client.chat.completions.create(
+            model=model_name, messages=[{"role": "user", "content": prompt}]
+        )
+        if response:
+            if isinstance(response, ChatCompletion):
+                content = response.choices[0].message.content
+            else:
+                raise Exception(
+                    f'[{llm_provider}] returned an invalid response: "{response}", please check your network '
+                    f"connection and try again."
+                )
+        else:
+            raise Exception(
+                f"[{llm_provider}] returned an empty response, please check your network connection and try again."
+            )
+
+    return content.replace("\n", "")  # g4f and OpenAI-compatible replies are returned with all newlines removed
+
+
+def _generate_response_video(prompt: str, llm_provider_video: str, video_file: Union[str, TextIO]) -> str:
+    """
+    多模态能力大模型
+    """
+    if llm_provider_video == "gemini":
+        api_key = config.app.get("gemini_api_key")
+        model_name = config.app.get("gemini_model_name")
+        base_url = "***"
+    else:
+        raise ValueError(
+            "llm_provider 未设置，请在 config.toml 文件中进行设置。"
+        )
+
+    if llm_provider_video == "gemini":
+        import google.generativeai as genai
+
+        genai.configure(api_key=api_key, transport="rest")
+
+        safety_settings = {
+            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+        }
+
+        model = genai.GenerativeModel(
+            model_name=model_name,
+            safety_settings=safety_settings,
+        )
+
+        try:
+            response = model.generate_content([prompt, video_file])
+            return response.text
+        except Exception as err:
+            return handle_exception(err)
+
+
+def compress_video(input_path: str, output_path: str):
+    """
+    压缩视频文件
+    Args:
+        input_path: 输入视频文件路径
+        output_path: 输出压缩后的视频文件路径
+    """
+    # 如果压缩后的视频文件已经存在，则直接使用
+    if os.path.exists(output_path):
+        logger.info(f"压缩视频文件已存在: {output_path}")
+        return
+
+    try:
+        clip = VideoFileClip(input_path)
+        clip.write_videofile(output_path, codec='libx264', audio_codec='aac', bitrate="500k", audio_bitrate="128k")
+    except subprocess.CalledProcessError as e:
+        logger.error(f"视频压缩失败: {e}")
+        raise
+
+
+def generate_script(
+    video_path: str, video_plot: str, video_name: str, language: str = "zh-CN", progress_callback=None
+) -> str:
+    """
+    生成视频剪辑脚本
+    Args:
+        video_path: 视频文件路径
+        video_plot: 视频剧情内容
+        video_name: 视频名称
+        language: 语言
+        progress_callback: 进度回调函数
+
+    Returns:
+        str: 生成的脚本
+    """
+    try:
+        # 1. 压缩视频
+        compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4"
+        compress_video(video_path, compressed_video_path)
+
+        # 在关键步骤更新进度
+        if progress_callback:
+            progress_callback(15, "压缩完成")  # 例如,在压缩视频后
+
+        # 2. 转录视频
+        transcription = gemini_video_transcription(
+            video_name=video_name,
+            video_path=compressed_video_path,
+            language=language,
+            llm_provider_video=config.app["video_llm_provider"],
+            progress_callback=progress_callback
+        )
+        if progress_callback:
+            progress_callback(60, "生成解说文案...")  # 例如,在转录视频后
+
+        # 3. 编写解说文案
+        script = writing_short_play(video_plot, video_name, config.app["llm_provider"], count=300)
+
+        # 在关键步骤更新进度
+        if progress_callback:
+            progress_callback(70, "匹配画面...")  # 例如,在生成脚本后
+
+        # 4. 文案匹配画面
+        if transcription != "":
+            matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider=config.app["video_llm_provider"])
+            # 在关键步骤更新进度
+            if progress_callback:
+                progress_callback(80, "匹配成功")
+            return matched_script
+        else:
+            return ""
+    except Exception as e:
+        handle_exception(e)
+        raise
+
+
+def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_callback=None):
+    """
+    Transcribe a video's frames and speech with a Gemini model.
+
+    Uploads the file, polls once per second until processing ends, then sends the prompt.
+    Returns the model response, False if the upload is rejected, else handle_exception()'s result.
+    """
+    import time  # local import: only the polling loop below needs it
+    report = progress_callback or (lambda *_: None)  # no-op when no callback is given
+    gemini.configure(api_key=config.app.get("gemini_api_key"))
+
+    prompt = """
+    请转录音频，包括时间戳，并提供视觉描述，然后以 JSON 格式输出，当前视频中使用的语言为 %s。
+    
+    在转录视频时，请通过确保以下条件来完成转录：
+    1. 画面描述使用语言: %s 进行输出。
+    2. 同一个画面合并为一个转录记录。
+    3. 使用以下 JSON schema:    
+        Graphics = {"timestamp": "MM:SS-MM:SS"(时间戳格式), "picture": "str"(画面描述), "speech": "str"(台词，如果没有人说话，则使用空字符串。)}
+        Return: list[Graphics]
+    4. 请以严格的 JSON 格式返回数据，不要包含任何注释、标记或其他字符。数据应符合 JSON 语法，可以被 json.loads() 函数直接解析， 不要添加 ```json 或其他标记。
+    """ % (language, language)
+
+    logger.debug(f"视频名称: {video_name}")
+    try:
+        report(20, "上传视频至 Google cloud")
+        gemini_video_file = gemini.upload_file(video_path)
+        logger.debug(f"视频 {gemini_video_file.name} 上传至 Google cloud 成功, 开始解析...")
+        while gemini_video_file.state.name == "PROCESSING":
+            time.sleep(1)  # pause between polls instead of hammering the API in a tight loop
+            gemini_video_file = gemini.get_file(gemini_video_file.name)
+            report(30, "上传成功, 开始解析")
+        if gemini_video_file.state.name == "FAILED":
+            raise ValueError(gemini_video_file.state.name)
+        elif gemini_video_file.state.name == "ACTIVE":
+            report(40, "解析完成, 开始转录...")
+            logger.debug("解析完成, 开始转录...")
+    except ResumableUploadError:
+        logger.error(f"上传视频至 Google cloud 失败, 用户的位置信息不支持用于该API; \n{traceback.format_exc()}")
+        return False
+    except FailedPrecondition:
+        logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}")
+        return False
+
+    report(50, "开始转录")
+    try:
+        response = _generate_response_video(prompt=prompt, llm_provider_video=llm_provider_video, video_file=gemini_video_file)
+        logger.success("视频转录成功")
+        logger.debug(response)
+        return response
+    except Exception as err:
+        return handle_exception(err)
+
+
+def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
+    """Ask the LLM for `amount` English stock-video search terms; returns [] if every retry fails."""
+    prompt = f"""
+# Role: Video Search Terms Generator
+
+## Goals:
+Generate {amount} search terms for stock videos, depending on the subject of a video.
+
+## Constrains:
+1. the search terms are to be returned as a json-array of strings.
+2. each search term should consist of 1-3 words, always add the main subject of the video.
+3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
+4. the search terms must be related to the subject of the video.
+5. reply with english search terms only.
+
+## Output Example:
+["search term 1", "search term 2", "search term 3","search term 4","search term 5"]
+
+## Context:
+### Video Subject
+{video_subject}
+
+### Video Script
+{video_script}
+
+Please note that you must use English for generating video search terms; Chinese is not accepted.
+""".strip()
+
+    logger.info(f"subject: {video_subject}")
+
+    search_terms: List[str] = []
+    for i in range(_max_retries):
+        response = ""
+        candidate = None
+        try:
+            response = _generate_response(prompt)
+            candidate = json.loads(response)
+        except Exception as e:
+            logger.warning(f"failed to generate video terms: {str(e)}")
+            try:
+                # Fallback: pull the outermost "[...]" span out of a chatty reply; DOTALL lets it span lines.
+                match = re.search(r"\[.*]", response, re.DOTALL) if response else None
+                if match:
+                    candidate = json.loads(match.group())
+            except Exception as inner:
+                logger.warning(f"failed to generate video terms: {str(inner)}")
+
+        is_str_list = isinstance(candidate, list) and all(isinstance(term, str) for term in candidate)
+        if is_str_list and candidate:
+            search_terms = candidate
+            break
+        if candidate is not None and not is_str_list:
+            logger.error("response is not a list of strings.")
+        # Only announce a retry when another attempt is actually left.
+        if i < _max_retries - 1:
+            logger.warning(f"failed to generate video terms, trying again... {i + 1}")
+
+    logger.success(f"completed: \n{search_terms}")
+    return search_terms
+
+
+def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot: str, language: str) -> str:
+    """
+    Analyse a film/TV video with the configured Gemini model and return its narration script.
+
+    Args:
+        video_origin_name: str - original title of the work (only used for logging)
+        video_origin_path: str - path of the video file to upload
+        video_plot: str - synopsis or plot outline of the work
+        language: str - language the narration should be written in
+
+    Returns:
+        str - the model's raw streamed reply (the prompt asks for a JSON list)
+
+    Raises:
+        ValueError: if the uploaded file ends up in the "FAILED" state
+    """
+    import time  # local import: only the polling loop below needs it
+
+    api_key = config.app.get("gemini_api_key")
+    model_name = config.app.get("gemini_model_name")
+
+    gemini.configure(api_key=api_key)
+    model = gemini.GenerativeModel(model_name=model_name)
+
+    prompt = """
+**角色设定：**  
+你是一位影视解说专家，擅长根据剧情生成引人入胜的短视频解说文案，特别熟悉适用于TikTok/抖音风格的快速、抓人视频解说。
+
+**任务目标：**  
+1. 根据给定剧情，详细描述画面，重点突出重要场景和情节。  
+2. 生成符合TikTok/抖音风格的解说，节奏紧凑，语言简洁，吸引观众。  
+3. 解说的时候需要解说一段播放一段原视频，原视频一般为有台词的片段，原视频的控制有 OST 字段控制。
+4. 结果输出为JSON格式，包含字段：  
+   - "picture"：画面描述  
+   - "timestamp"：画面出现的时间范围  
+   - "narration"：解说内容
+   - "OST": 是否开启原声（true / false）
+
+**输入示例：**  
+```text  
+在一个黑暗的小巷中，主角缓慢走进，四周静谧无声，只有远处隐隐传来猫的叫声。突然，背后出现一个神秘的身影。  
+```  
+
+**输出格式：**  
+```json  
+[  
+    {  
+        "picture": "黑暗的小巷，主角缓慢走入，四周安静，远处传来猫叫声。",  
+        "timestamp": "00:00-00:17",  
+        "narration": "静谧的小巷里，主角步步前行，气氛渐渐变得压抑。",  
+        "OST": false  
+    },  
+    {  
+        "picture": "神秘身影突然出现，紧张气氛加剧。",  
+        "timestamp": "00:17-00:39",  
+        "narration": "原声播放",  
+        "OST": true  
+    }  
+]  
+```  
+
+**提示：**  
+- 文案要简短有力，契合短视频平台用户的观赏习惯。  
+- 保持强烈的悬念和情感代入，吸引观众继续观看。  
+- 解说一段后播放一段原声，原声内容尽量和解说匹配。
+- 文案语言为：%s  
+- 剧情内容：%s (为空则忽略)  
+
+""" % (language, video_plot)
+
+    logger.debug(f"视频名称: {video_origin_name}")
+    gemini_video_file = gemini.upload_file(video_origin_path)
+    logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}")
+    # Poll once per second until the upload leaves the PROCESSING state.
+    while gemini_video_file.state.name == "PROCESSING":
+        time.sleep(1)
+        gemini_video_file = gemini.get_file(gemini_video_file.name)
+        logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}")
+    if gemini_video_file.state.name == "FAILED":
+        raise ValueError(gemini_video_file.state.name)
+
+    # Stream the answer and stitch the text chunks back together.
+    streams = model.generate_content([prompt, gemini_video_file], stream=True)
+    response = "".join(chunk.text for chunk in streams)
+    logger.success(f"llm response: \n{response}")
+
+    return response
+
+
+def writing_movie(video_plot, video_name, llm_provider):
+    """
+    Generate a film narration script for ``video_name`` from its plot via the given LLM provider.
+    """
+    prompt = f"""
+    **角色设定：**  
+    你是一名有10年经验的影视解说文案的创作者，
+    下面是关于如何写解说文案的方法 {Method}，请认真阅读它，之后我会给你一部影视作品的名称，然后让你写一篇文案
+    请根据方法撰写 《{video_name}》的影视解说文案，《{video_name}》的大致剧情如下: {video_plot}
+    文案要符合以下要求:
+    
+    **任务目标：**  
+    1. 文案字数在 1500字左右，严格要求字数，最低不得少于 1000字。
+    2. 避免使用 markdown 格式输出文案。  
+    3. 仅输出解说文案，不输出任何其他内容。
+    4. 不要包含小标题，每个段落以 \\n 进行分隔。
+    """
+    try:
+        response = _generate_response(prompt, llm_provider)
+        logger.success("解说文案生成成功")
+        return response
+    except Exception as err:
+        return handle_exception(err)
+
+
+def writing_short_play(video_plot: str, video_name: str, llm_provider: str, count: int = 500):
+    """
+    Generate a narration script of roughly ``count`` characters for a short drama via the LLM.
+    Raises ValueError when the synopsis or the title is empty.
+    """
+    for value, message in ((video_plot, "短剧的简介不能为空"), (video_name, "短剧名称不能为空")):
+        if not value:
+            raise ValueError(message)
+
+    prompt = f"""
+    **角色设定：**  
+    你是一名有10年经验的短剧解说文案的创作者，
+    下面是关于如何写解说文案的方法 {Method}，请认真阅读它，之后我会给你一部短剧作品的简介，然后让你写一篇解说文案
+    请根据方法撰写 《{video_name}》的解说文案，《{video_name}》的大致剧情如下: {video_plot}
+    文案要符合以下要求:
+
+    **任务目标：**  
+    1. 请严格要求文案字数, 字数控制在 {count} 字左右。
+    2. 避免使用 markdown 格式输出文案。
+    3. 仅输出解说文案，不输出任何其他内容。
+    4. 不要包含小标题，每个段落以 \\n 进行分隔。
+    """
+    try:
+        script = _generate_response(prompt, llm_provider)
+        logger.success("解说文案生成成功")
+        logger.debug(script)
+        return script
+    except Exception as exc:
+        return handle_exception(exc)
+
+
+def screen_matching(huamian: str, wenan: str, llm_provider: str):
+    """
+    Match narration copy to video timestamps in a single LLM request.
+    Raises ValueError when the transcript (huamian) or the copy (wenan) is empty.
+    """
+    for value, message in ((huamian, "画面不能为空"), (wenan, "文案不能为空")):
+        if not value:
+            raise ValueError(message)
+
+    prompt = """
+    你是一名有10年经验的影视解说创作者，
+    你的任务是根据视频转录脚本和解说文案，匹配出每段解说文案对应的画面时间戳, 结果以 json 格式输出。
+    
+    注意：
+    转录脚本中 
+        - timestamp: 表示视频时间戳
+        - picture: 表示当前画面描述
+        - speech": 表示当前视频中人物的台词
+    
+    转录脚本和文案（由 XML 标记<PICTURE></PICTURE>和 <COPYWRITER></COPYWRITER>分隔）如下所示：
+    <PICTURE>
+    %s
+    </PICTURE>
+    
+    <COPYWRITER>
+    %s
+    </COPYWRITER>
+
+    在匹配的过程中，请通过确保以下条件来完成匹配：
+    - 使用以下 JSON schema:    
+        script = {'picture': str, 'timestamp': str(时间戳), "narration": str, "OST": bool(是否开启原声)}
+        Return: list[script]
+    - picture: 字段表示当前画面描述，与转录脚本保持一致
+    - timestamp: 字段表示某一段文案对应的画面的时间戳，不必和转录脚本的时间戳一致，应该充分考虑文案内容，匹配出与其描述最匹配的时间戳
+        - 请注意，请严格的执行已经出现的画面不能重复出现，即生成的脚本中 timestamp 不能有重叠的部分。
+    - narration: 字段表示需要解说文案，每段解说文案尽量不要超过30字
+    - OST: 字段表示是否开启原声，即当 OST 字段为 true 时，narration 字段为空字符串，当 OST 为 false 时，narration 字段为对应的解说文案
+    - 注意，在画面匹配的过程中，需要适当的加入原声播放，使得解说和画面更加匹配，请按照 1:1 的比例，生成原声和解说的脚本内容。
+    - 注意，在时间戳匹配上，一定不能原样照搬“转录脚本”，应当适当的合并或者删减一些片段。
+    - 注意，第一个画面一定是原声播放并且时长不少于 20 s，为了吸引观众，第一段一定是整个转录脚本中最精彩的片段。
+    - 请以严格的 JSON 格式返回数据，不要包含任何注释、标记或其他字符。数据应符合 JSON 语法，可以被 json.loads() 函数直接解析， 不要添加 ```json 或其他标记。
+    """ % (huamian, wenan)
+
+    try:
+        matched = _generate_response(prompt, llm_provider)
+        logger.success("匹配成功")
+        logger.debug(matched)
+        return matched
+    except Exception as exc:
+        return handle_exception(exc)
+
+
+if __name__ == "__main__":
+    # 1. Video transcription: manual smoke test against a hard-coded local file
+    video_subject = "第二十条之无罪释放"
+    video_path = "/Users/apple/Desktop/home/pipedream_project/downloads/jianzao.mp4"
+    language = "zh-CN"
+    gemini_video_transcription(
+        video_name=video_subject,
+        video_path=video_path,
+        language=language,
+        progress_callback=print,
+        llm_provider_video="gemini"
+    )
+
+    # # 2. Narration script (disabled sample run, kept for reference)
+    # video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4"
+    # # video_path = "E:\\projects\\NarratoAI\\resource\\videos\\1.mp4"
+    # video_plot = """
+    #     李自忠拿着儿子李牧名下的存折，去银行取钱给儿子救命，却被要求证明"你儿子是你儿子"。
+    # 走投无路时碰到银行被抢劫，劫匪给了他两沓钱救命，李自忠却因此被银行以抢劫罪起诉，并顶格判处20年有期徒刑。
+    # 苏醒后的李牧坚决为父亲做无罪辩护，面对银行的顶级律师团队，他一个法学院大一学生，能否力挽狂澜，创作奇迹？挥法律之利剑 ，持正义之天平！
+    # """
+    # res = generate_script(video_path, video_plot, video_name="第二十条之无罪释放")
+    # # res = generate_script(video_path, video_plot, video_name="海岸")
+    # print("脚本生成成功:\n", res)
+    # res = clean_model_output(res)
+    # aaa = json.loads(res)
+    # print(json.dumps(aaa, indent=2, ensure_ascii=False))
diff --git a/app/services/material.py b/app/services/material.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a3c289cf8412bd854d1b08aaca17b129ed39254
--- /dev/null
+++ b/app/services/material.py
@@ -0,0 +1,561 @@
+import os
+import subprocess
+import random
+import traceback
+from urllib.parse import urlencode
+from datetime import datetime
+import json
+
+import requests
+from typing import List, Optional
+from loguru import logger
+from moviepy.video.io.VideoFileClip import VideoFileClip
+
+from app.config import config
+from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo
+from app.utils import utils
+from app.utils import ffmpeg_utils
+
+requested_count = 0
+
+
+def get_api_key(cfg_key: str):
+    """Return the API key configured under ``cfg_key``, rotating when a list of keys is set."""
+    global requested_count
+    configured = config.app.get(cfg_key)
+    if not configured:
+        raise ValueError(
+            f"\n\n##### {cfg_key} is not set #####\n\nPlease set it in the config.toml file: {config.config_file}\n\n"
+            f"{utils.to_json(config.app)}"
+        )
+    # A single string key needs no rotation.
+    if isinstance(configured, str):
+        return configured
+    # The module-wide counter is advanced before indexing, so keys are handed out round-robin.
+    requested_count += 1
+    return configured[requested_count % len(configured)]
+
+
+def search_videos_pexels(
+    search_term: str,
+    minimum_duration: int,
+    video_aspect: VideoAspect = VideoAspect.portrait,
+) -> List[MaterialInfo]:
+    """Search Pexels for videos matching ``search_term``.
+
+    A video is kept when it lasts at least ``minimum_duration`` and offers a file whose
+    width and height equal ``VideoAspect(video_aspect).to_resolution()``. Returns the
+    matching MaterialInfo items, or an empty list if the request or parsing fails.
+    """
+    aspect = VideoAspect(video_aspect)
+    target_w, target_h = aspect.to_resolution()
+    wanted_size = (target_w, target_h)
+    auth_headers = {"Authorization": get_api_key("pexels_api_keys")}
+    query = urlencode({"query": search_term, "per_page": 20, "orientation": aspect.name})
+    query_url = f"https://api.pexels.com/videos/search?{query}"
+    logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}")
+
+    try:
+        payload = requests.get(
+            query_url,
+            headers=auth_headers,
+            proxies=config.proxy,
+            verify=False,
+            timeout=(30, 60),
+        ).json()
+        matches = []
+        if "videos" not in payload:
+            logger.error(f"search videos failed: {payload}")
+            return matches
+        for entry in payload["videos"]:
+            length = entry["duration"]
+            # Ignore videos shorter than the requested minimum.
+            if length < minimum_duration:
+                continue
+            # Keep the first file whose size matches the target exactly.
+            for candidate in entry["video_files"]:
+                size = (int(candidate["width"]), int(candidate["height"]))
+                if size != wanted_size:
+                    continue
+                material = MaterialInfo()
+                material.provider = "pexels"
+                material.url = candidate["link"]
+                material.duration = length
+                matches.append(material)
+                break
+        return matches
+    except Exception as exc:
+        logger.error(f"search videos failed: {str(exc)}")
+
+    return []
+
+
+def search_videos_pixabay(
+    search_term: str,
+    minimum_duration: int,
+    video_aspect: VideoAspect = VideoAspect.portrait,
+) -> List[MaterialInfo]:
+    """Search Pixabay for videos matching ``search_term``.
+
+    A hit is kept when it lasts at least ``minimum_duration`` and has a rendition at least
+    as wide as the target resolution. Returns [] if the request or parsing fails.
+    """
+    aspect = VideoAspect(video_aspect)
+    video_width, _ = aspect.to_resolution()
+
+    params = {
+        "q": search_term,
+        "video_type": "all",  # Accepted values: "all", "film", "animation"
+        "per_page": 50,
+        "key": get_api_key("pixabay_api_keys"),
+    }
+    query_url = f"https://pixabay.com/api/videos/?{urlencode(params)}"
+    # The real URL carries the API key as a query parameter, so it must never be
+    # logged as-is: log a copy with the key masked.
+    safe_url = f"https://pixabay.com/api/videos/?{urlencode({**params, 'key': 'REDACTED'})}"
+    logger.info(f"searching videos: {safe_url}, with proxies: {config.proxy}")
+
+    try:
+        # NOTE(review): verify=False disables TLS certificate checks — confirm this is intentional.
+        r = requests.get(
+            query_url, proxies=config.proxy, verify=False, timeout=(30, 60)
+        )
+        response = r.json()
+        video_items = []
+        if "hits" not in response:
+            logger.error(f"search videos failed: {response}")
+            return video_items
+        for v in response["hits"]:
+            duration = v["duration"]
+            # Skip hits shorter than the requested minimum.
+            if duration < minimum_duration:
+                continue
+            # "videos" maps a key to each rendition; take the first one that is wide enough.
+            for video in v["videos"].values():
+                if int(video["width"]) >= video_width:
+                    item = MaterialInfo()
+                    item.provider = "pixabay"
+                    item.url = video["url"]
+                    item.duration = duration
+                    video_items.append(item)
+                    break
+        return video_items
+    except Exception as e:
+        logger.error(f"search videos failed: {str(e)}")
+
+    return []
+
+
+def save_video(video_url: str, save_dir: str = "") -> str:
+    """Download ``video_url`` into ``save_dir`` and return the local path, or "" if unusable.
+
+    The file name comes from utils.md5 of the URL without its query string, so a non-empty
+    file from an earlier call is reused. A download that cannot be opened as a video is
+    deleted, so the reuse check can never hand back a broken file.
+    """
+    if not save_dir:
+        save_dir = utils.storage_dir("cache_videos")
+    os.makedirs(save_dir, exist_ok=True)
+
+    url_without_query = video_url.split("?")[0]
+    video_path = f"{save_dir}/vid-{utils.md5(url_without_query)}.mp4"
+
+    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
+        logger.info(f"video already exists: {video_path}")
+        return video_path
+
+    # Fetch before opening the file so a failed request leaves no empty file behind.
+    data = requests.get(video_url, proxies=config.proxy, verify=False, timeout=(60, 240)).content
+    with open(video_path, "wb") as f:
+        f.write(data)
+
+    try:
+        clip = VideoFileClip(video_path)
+        try:
+            duration, fps = clip.duration, clip.fps
+        finally:
+            clip.close()
+        if duration > 0 and fps > 0:
+            return video_path
+        logger.warning(f"无效的视频文件: {video_path} => duration={duration}, fps={fps}")
+    except Exception as e:
+        logger.warning(f"无效的视频文件: {video_path} => {str(e)}")
+    try:
+        os.remove(video_path)
+    except OSError as e:
+        logger.warning(f"failed to remove invalid video: {video_path} => {str(e)}")
+    return ""
+
+
+def download_videos(
+    task_id: str,
+    search_terms: List[str],
+    source: str = "pexels",
+    video_aspect: VideoAspect = VideoAspect.portrait,
+    video_contact_mode: VideoConcatMode = VideoConcatMode.random,
+    audio_duration: float = 0.0,
+    max_clip_duration: int = 5,
+) -> List[str]:
+    """Search stock footage for every term and download clips until the audio is covered.
+
+    Each saved clip counts for at most ``max_clip_duration`` seconds; downloading stops
+    once the counted total exceeds ``audio_duration``. Returns the saved file paths.
+    """
+    search_videos = search_videos_pixabay if source == "pixabay" else search_videos_pexels
+    candidates = []
+    seen_urls = []
+    found_duration = 0.0
+    for search_term in search_terms:
+        hits = search_videos(
+            search_term=search_term,
+            minimum_duration=max_clip_duration,
+            video_aspect=video_aspect,
+        )
+        logger.info(f"found {len(hits)} videos for '{search_term}'")
+        for hit in hits:
+            if hit.url in seen_urls:
+                continue
+            candidates.append(hit)
+            seen_urls.append(hit.url)
+            found_duration += hit.duration
+
+    logger.info(
+        f"found total videos: {len(candidates)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds"
+    )
+
+    material_directory = config.app.get("material_directory", "").strip()
+    if material_directory == "task":
+        material_directory = utils.task_dir(task_id)
+    elif material_directory and not os.path.isdir(material_directory):
+        material_directory = ""
+
+    if video_contact_mode.value == VideoConcatMode.random.value:
+        random.shuffle(candidates)
+
+    video_paths = []
+    total_duration = 0.0
+    for hit in candidates:
+        try:
+            logger.info(f"downloading video: {hit.url}")
+            saved_video_path = save_video(video_url=hit.url, save_dir=material_directory)
+            if not saved_video_path:
+                continue
+            logger.info(f"video saved: {saved_video_path}")
+            video_paths.append(saved_video_path)
+            total_duration += min(max_clip_duration, hit.duration)
+            if total_duration > audio_duration:
+                logger.info(
+                    f"total duration of downloaded videos: {total_duration} seconds, skip downloading more"
+                )
+                break
+        except Exception as exc:
+            logger.error(f"failed to download video: {utils.to_json(hit)} => {str(exc)}")
+    logger.success(f"downloaded {len(video_paths)} videos")
+    return video_paths
+
+
+def time_to_seconds(time_str: str) -> float:
+    """
+    Convert a timestamp string into seconds.
+    Accepted format: 'HH:MM:SS,mmm' (hours:minutes:seconds,milliseconds); ',mmm' is optional.
+
+    Args:
+        time_str: timestamp string, e.g. "00:00:20,100"
+
+    Returns:
+        float: the number of seconds (including the millisecond fraction)
+
+    Raises:
+        ValueError: if time_str does not match the accepted format
+    """
+    try:
+        # Split off the optional millisecond part first.
+        if ',' not in time_str:
+            clock, ms = time_str, 0
+        else:
+            clock, ms_text = time_str.split(',')
+            ms = int(ms_text) / 1000
+
+        # The clock part must be exactly hours, minutes and seconds.
+        fields = clock.split(':')
+        if len(fields) != 3:
+            raise ValueError("时间格式必须为 HH:MM:SS,mmm")
+
+        h, m, s = map(int, fields)
+        return h * 3600 + m * 60 + s + ms
+
+    except ValueError as e:
+        logger.error(f"时间格式错误: {time_str}")
+        raise ValueError(f"时间格式错误: 必须为 HH:MM:SS,mmm 格式") from e
+
+
+def format_timestamp(seconds: float) -> str:
+    """
+    Format a number of seconds as a readable timestamp (HH:MM:SS,mmm).
+
+    Args:
+        seconds: number of seconds (may include a fraction)
+
+    Returns:
+        str: formatted timestamp, e.g. "00:00:20,100"
+    """
+    # Work in whole milliseconds: truncating the float remainder turned e.g. 1.001 into ",000".
+    total_ms = int(round(seconds * 1000))
+    whole_seconds, milliseconds = divmod(total_ms, 1000)
+    minutes, whole_seconds = divmod(whole_seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
+
+
+def _detect_hardware_acceleration() -> Optional[str]:
+    """
+    Report the hardware acceleration type available to ffmpeg on this system.
+
+    Returns:
+        Optional[str]: the value of ffmpeg_utils.get_ffmpeg_hwaccel_type() (presumably None when unsupported)
+    """
+    # Detection is centralised in ffmpeg_utils;
+    # this function only forwards its answer.
+    return ffmpeg_utils.get_ffmpeg_hwaccel_type()
+
+
+def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> str:
+    """
+    Cut the given time range out of a video with ffmpeg and save it as its own file.
+
+    Args:
+        timestamp: time range to cut, formatted as 'HH:MM:SS,mmm-HH:MM:SS,mmm'
+                  e.g. '00:00:00,000-00:00:20,100'
+        origin_video: path of the source video
+        save_dir: output directory; defaults to <utils.temp_dir()>/clip_video/<utils.md5(origin_video)>
+
+    Returns:
+        str: path of the clipped video, or '' when the clip could not be produced
+    """
+    # Default location: one sub-directory per source video
+    if not save_dir:
+        base_dir = os.path.join(utils.temp_dir(), "clip_video")
+        video_hash = utils.md5(origin_video)
+        save_dir = os.path.join(base_dir, video_hash)
+
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    # Split the range into its start and end timestamps
+    start_str, end_str = timestamp.split('-')
+
+    # Build the output file name (colons and commas are replaced by hyphens)
+    safe_start_time = start_str.replace(':', '-').replace(',', '-')
+    safe_end_time = end_str.replace(':', '-').replace(',', '-')
+    output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
+    video_path = os.path.join(save_dir, output_filename)
+
+    # Reuse a non-empty clip that already exists
+    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
+        logger.info(f"视频已存在: {video_path}")
+        return video_path
+
+    try:
+        # Make sure the source video exists
+        if not os.path.exists(origin_video):
+            logger.error(f"源视频文件不存在: {origin_video}")
+            return ''
+
+        # Read the total duration of the source video with ffprobe
+        try:
+            probe_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+                        "-of", "default=noprint_wrappers=1:nokey=1", origin_video]
+            total_duration = float(subprocess.check_output(probe_cmd).decode('utf-8').strip())
+        except subprocess.CalledProcessError as e:
+            logger.error(f"获取视频时长失败: {str(e)}")
+            return ''
+
+        # Convert both ends of the range to seconds
+        start = time_to_seconds(start_str)
+        end = time_to_seconds(end_str)
+
+        # Validate the range against the video length
+        if start >= total_duration:
+            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
+            return ''
+
+        if end > total_duration:
+            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)，将自动调整为视频结尾")
+            end = total_duration
+
+        if end <= start:
+            logger.warning(f"结束时间 {format_timestamp(end)} 必须大于起始时间 {format_timestamp(start)}")
+            return ''
+
+        # Clip length (currently only referenced by the commented-out log line below)
+        duration = end - start
+        # logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)}，时长 {format_timestamp(duration)}")
+
+        # Hardware acceleration arguments, if a type was detected
+        hwaccel = _detect_hardware_acceleration()
+        hwaccel_args = []
+        if hwaccel:
+            hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()
+
+        # ffmpeg wants '.' before the milliseconds; note the original strings are used here, not the clamped `end`
+        ffmpeg_start_time = start_str.replace(',', '.')
+        ffmpeg_end_time = end_str.replace(',', '.')
+
+        # Build the ffmpeg command
+        ffmpeg_cmd = [
+            "ffmpeg", "-y", *hwaccel_args,
+            "-i", origin_video,
+            "-ss", ffmpeg_start_time,
+            "-to", ffmpeg_end_time,
+            "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264",
+            "-c:a", "aac",
+            "-strict", "experimental",
+            video_path
+        ]
+
+        # Run the ffmpeg command
+        # logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}")
+        # logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}")
+
+        # On Windows decode the output as UTF-8 to avoid GBK decoding errors
+        is_windows = os.name == 'nt'
+        if is_windows:
+            process = subprocess.run(
+                ffmpeg_cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                encoding='utf-8',  # explicitly decode as UTF-8
+                text=True,
+                check=False  # do not raise; the return code is checked below
+            )
+        else:
+            process = subprocess.run(
+                ffmpeg_cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=False  # do not raise; the return code is checked below
+            )
+
+        # Check whether ffmpeg succeeded
+        if process.returncode != 0:
+            logger.error(f"视频剪辑失败: {process.stderr}")
+            if os.path.exists(video_path):
+                os.remove(video_path)
+            return ''
+
+        # Validate the generated file
+        if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
+            # Let ffprobe open the clip; exit code 0 counts as valid
+            probe_cmd = ["ffprobe", "-v", "error", video_path]
+            # On Windows decode the output as UTF-8
+            if is_windows:
+                validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
+            else:
+                validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            if validate_result.returncode == 0:
+                logger.info(f"视频剪辑成功: {video_path}")
+                return video_path
+
+        logger.error("视频文件验证失败")
+        if os.path.exists(video_path):
+            os.remove(video_path)
+        return ''
+
+    except Exception as e:
+        logger.error(f"视频剪辑过程中发生错误: \n{str(traceback.format_exc())}")
+        if os.path.exists(video_path):
+            os.remove(video_path)
+        return ''
+
+
+def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict:
+    """
+    Cut one clip per timestamp range out of the source video.
+
+    Args:
+        task_id: task id (not used by the current implementation)
+        timestamp_terms: ranges to cut, e.g. ['00:00:00,000-00:00:20,100', '00:00:43,039-00:00:46,959']
+        origin_video: path of the source video
+        progress_callback: optional callable invoked as (clips_done, total_clips)
+
+    Returns:
+        dict mapping the 1-based position of each range to its clip path; {} if any range raises
+    """
+    clips = {}
+    total = len(timestamp_terms)
+    for position, term in enumerate(timestamp_terms, start=1):
+        # NOTE(review): unlike download_videos, the "task" setting is passed through unchanged here — confirm.
+        target_dir = config.app.get("material_directory", "").strip()
+        try:
+            clip_path = save_clip_video(timestamp=term, origin_video=origin_video, save_dir=target_dir)
+            if clip_path:
+                clips[position] = clip_path
+            # Progress is reported even when this range produced no clip.
+            if progress_callback:
+                progress_callback(position, total)
+        except Exception as exc:
+            logger.error(f"视频裁剪失败: {utils.to_json(term)} =>\n{str(traceback.format_exc())}")
+            return {}
+
+    logger.success(f"裁剪 {len(clips)} videos")
+    return clips
+
+
+def merge_videos(video_paths, ost_list):
+    """
+    Concatenate several videos into "combined.mp4", optionally muting individual clips.
+
+    :param video_paths: list of video file paths
+    :param ost_list: list of booleans, True keeps that clip's original sound
+    :return: path of the merged file, or None if the final ffmpeg merge fails
+    :raises ValueError: if the two lists differ in length or are empty
+    """
+    import shutil  # local imports: only this function needs them
+    import tempfile
+
+    if len(video_paths) != len(ost_list):
+        raise ValueError("视频路径列表和保留原声列表长度必须相同")
+    if not video_paths:
+        raise ValueError("视频路径列表不能为空")
+
+    output_file = "combined.mp4"
+    # Keep the list file and muted copies in a private scratch directory so that
+    # concurrent runs or leftovers in the working directory cannot clash.
+    work_dir = tempfile.mkdtemp(prefix="merge_videos_")
+    try:
+        list_path = os.path.join(work_dir, "file_list.txt")
+        with open(list_path, "w", encoding="utf-8") as f:
+            for index, (video_path, keep_ost) in enumerate(zip(video_paths, ost_list)):
+                source = os.path.abspath(video_path)
+                if not keep_ost:
+                    # Indexed name: two inputs with the same basename no longer overwrite each other.
+                    silent_video = os.path.join(work_dir, f"silent_{index}_{os.path.basename(video_path)}")
+                    subprocess.run(["ffmpeg", "-y", "-i", source, "-c:v", "copy", "-an", silent_video], check=True)
+                    source = silent_video
+                # The list lives outside the working directory, hence absolute paths;
+                # a single quote inside a path must be written as '\'' for the concat demuxer.
+                escaped = source.replace("'", "'\\''")
+                f.write(f"file '{escaped}'\n")
+
+        ffmpeg_cmd = [
+            "ffmpeg", "-y",  # -y: overwrite an existing output instead of prompting or failing
+            "-f", "concat",
+            "-safe", "0",
+            "-i", list_path,
+            "-c:v", "copy",
+            "-c:a", "aac",
+            "-strict", "experimental",
+            output_file
+        ]
+        try:
+            subprocess.run(ffmpeg_cmd, check=True)
+        except subprocess.CalledProcessError as e:
+            print(f"视频合并失败：{e}")
+            return None
+        print(f"视频合并成功：{output_file}")
+        return output_file
+    finally:
+        shutil.rmtree(work_dir, ignore_errors=True)
diff --git a/app/services/merger_video.py b/app/services/merger_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d688bf40326fbc409247c373f4972d42b15156c
--- /dev/null
+++ b/app/services/merger_video.py
@@ -0,0 +1,662 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : merger_video
+@Author : 小林同学
+@Date   : 2025/5/6 下午7:38
+'''
+
+import os
+import shutil
+import subprocess
+from enum import Enum
+from typing import List, Optional, Tuple
+from loguru import logger
+
+from app.utils import ffmpeg_utils
+
+
+class VideoAspect(Enum):
+    """视频宽高比枚举"""
+    landscape = "16:9"  # 横屏 16:9
+    landscape_2 = "4:3"
+    portrait = "9:16"   # 竖屏 9:16
+    portrait_2 = "3:4"
+    square = "1:1"      # 方形 1:1
+
+    def to_resolution(self) -> Tuple[int, int]:
+        """根据宽高比返回标准分辨率"""
+        if self == VideoAspect.portrait:
+            return 1080, 1920  # 竖屏 9:16
+        elif self == VideoAspect.portrait_2:
+            return 720, 1280   # 竖屏 3:4
+        elif self == VideoAspect.landscape:
+            return 1920, 1080  # 横屏 16:9
+        elif self == VideoAspect.landscape_2:
+            return 1280, 720   # 横屏 4:3
+        elif self == VideoAspect.square:
+            return 1080, 1080  # 方形 1:1
+        else:
+            return 1080, 1920  # 默认竖屏
+
+
+def check_ffmpeg_installation() -> bool:
+    """
+    检查ffmpeg是否已安装
+
+    Returns:
+        bool: 如果安装则返回True，否则返回False
+    """
+    try:
+        subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+        return True
+    except (subprocess.SubprocessError, FileNotFoundError):
+        logger.error("ffmpeg未安装或不在系统PATH中，请安装ffmpeg")
+        return False
+
+
+def get_hardware_acceleration_option() -> Optional[str]:
+    """
+    根据系统环境选择合适的硬件加速选项
+
+    Returns:
+        Optional[str]: 硬件加速参数，如果不支持则返回None
+    """
+    # 使用集中式硬件加速检测
+    return ffmpeg_utils.get_ffmpeg_hwaccel_type()
+
+
+def check_video_has_audio(video_path: str) -> bool:
+    """
+    检查视频是否包含音频流
+
+    Args:
+        video_path: 视频文件路径
+
+    Returns:
+        bool: 如果视频包含音频流则返回True，否则返回False
+    """
+    if not os.path.exists(video_path):
+        logger.warning(f"视频文件不存在: {video_path}")
+        return False
+
+    probe_cmd = [
+        'ffprobe', '-v', 'error',
+        '-select_streams', 'a:0',
+        '-show_entries', 'stream=codec_type',
+        '-of', 'csv=p=0',
+        video_path
+    ]
+
+    try:
+        result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False)
+        return result.stdout.strip() == 'audio'
+    except Exception as e:
+        logger.warning(f"检测视频音频流时出错: {str(e)}")
+        return False
+
+
+def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str:
+    """
+    创建ffmpeg合并所需的concat文件
+
+    Args:
+        video_paths: 需要合并的视频文件路径列表
+        concat_file_path: concat文件的输出路径
+
+    Returns:
+        str: concat文件的路径
+    """
+    with open(concat_file_path, 'w', encoding='utf-8') as f:
+        for video_path in video_paths:
+            # 获取绝对路径
+            abs_path = os.path.abspath(video_path)
+            # 在Windows上将反斜杠替换为正斜杠
+            if os.name == 'nt':  # Windows系统
+                abs_path = abs_path.replace('\\', '/')
+            else:  # Unix/Mac系统
+                # 转义特殊字符
+                abs_path = abs_path.replace('\\', '\\\\').replace(':', '\\:')
+
+            # 处理路径中的单引号 (如果有)
+            abs_path = abs_path.replace("'", "\\'")
+
+            f.write(f"file '{abs_path}'\n")
+    return concat_file_path
+
+
+def process_single_video(
+        input_path: str,
+        output_path: str,
+        target_width: int,
+        target_height: int,
+        keep_audio: bool = True,
+        hwaccel: Optional[str] = None
+) -> str:
+    """
+    处理单个视频：调整分辨率、帧率等
+
+    Args:
+        input_path: 输入视频路径
+        output_path: 输出视频路径
+        target_width: 目标宽度
+        target_height: 目标高度
+        keep_audio: 是否保留音频
+        hwaccel: 硬件加速选项
+
+    Returns:
+        str: 处理后的视频路径
+    """
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"找不到视频文件: {input_path}")
+
+    # 构建基本命令
+    command = ['ffmpeg', '-y']
+
+    # 安全检查：如果在Windows上，则慎用硬件加速
+    is_windows = os.name == 'nt'
+    if is_windows and hwaccel:
+        logger.info("在Windows系统上检测到硬件加速请求，将进行额外的兼容性检查")
+        try:
+            # 对视频进行快速探测，检测其基本信息
+            probe_cmd = [
+                'ffprobe', '-v', 'error',
+                '-select_streams', 'v:0',
+                '-show_entries', 'stream=codec_name,width,height',
+                '-of', 'csv=p=0',
+                input_path
+            ]
+            result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False)
+
+            # 如果探测成功，使用硬件加速；否则降级到软件编码
+            if result.returncode != 0:
+                logger.warning(f"视频探测失败，为安全起见，禁用硬件加速: {result.stderr}")
+                hwaccel = None
+        except Exception as e:
+            logger.warning(f"视频探测出错，禁用硬件加速: {str(e)}")
+            hwaccel = None
+
+    # 添加硬件加速参数（根据前面的安全检查可能已经被禁用）
+    if hwaccel:
+        try:
+            # 使用集中式硬件加速参数
+            hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()
+            command.extend(hwaccel_args)
+        except Exception as e:
+            logger.warning(f"应用硬件加速参数时出错: {str(e)}，将使用软件编码")
+            # 重置命令，移除可能添加了一半的硬件加速参数
+            command = ['ffmpeg', '-y']
+
+    # 输入文件
+    command.extend(['-i', input_path])
+
+    # 处理音频
+    if not keep_audio:
+        command.extend(['-an'])  # 移除音频
+    else:
+        # 检查输入视频是否有音频流
+        has_audio = check_video_has_audio(input_path)
+        if has_audio:
+            command.extend(['-c:a', 'aac', '-b:a', '128k'])  # 音频编码为AAC
+        else:
+            logger.warning(f"视频 {input_path} 没有音频流，将会忽略音频设置")
+            command.extend(['-an'])  # 没有音频流时移除音频设置
+
+    # 视频处理参数：缩放并添加填充以保持比例
+    scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease"
+    pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2"
+    command.extend([
+        '-vf', f"{scale_filter},{pad_filter}",
+        '-r', '30',  # 设置帧率为30fps
+    ])
+
+    # 选择编码器 - 考虑到Windows和特定硬件的兼容性
+    use_software_encoder = True
+
+    if hwaccel:
+        # 获取硬件加速类型和编码器信息
+        hwaccel_type = ffmpeg_utils.get_ffmpeg_hwaccel_type()
+        hwaccel_encoder = ffmpeg_utils.get_ffmpeg_hwaccel_encoder()
+
+        if hwaccel_type == 'cuda' or hwaccel_type == 'nvenc':
+            try:
+                # 检查NVENC编码器是否可用
+                encoders_cmd = subprocess.run(
+                    ["ffmpeg", "-hide_banner", "-encoders"],
+                    stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
+                )
+
+                if "h264_nvenc" in encoders_cmd.stdout.lower():
+                    command.extend(['-c:v', 'h264_nvenc', '-preset', 'p4', '-profile:v', 'high'])
+                    use_software_encoder = False
+                else:
+                    logger.warning("NVENC编码器不可用，将使用软件编码")
+            except Exception as e:
+                logger.warning(f"NVENC编码器检测失败: {str(e)}，将使用软件编码")
+        elif hwaccel_type == 'qsv':
+            command.extend(['-c:v', 'h264_qsv', '-preset', 'medium'])
+            use_software_encoder = False
+        elif hwaccel_type == 'videotoolbox':  # macOS
+            command.extend(['-c:v', 'h264_videotoolbox', '-profile:v', 'high'])
+            use_software_encoder = False
+        elif hwaccel_type == 'vaapi':  # Linux VA-API
+            command.extend(['-c:v', 'h264_vaapi', '-profile', '100'])
+            use_software_encoder = False
+
+    # 如果前面的条件未能应用硬件编码器，使用软件编码
+    if use_software_encoder:
+        logger.info("使用软件编码器(libx264)")
+        command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high'])
+
+    # 设置视频比特率和其他参数
+    command.extend([
+        '-b:v', '5M',
+        '-maxrate', '8M',
+        '-bufsize', '10M',
+        '-pix_fmt', 'yuv420p',  # 兼容性更好的颜色格式
+    ])
+
+    # 输出文件
+    command.append(output_path)
+
+    # 执行命令
+    try:
+        # logger.info(f"执行FFmpeg命令: {' '.join(command)}")
+        process = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        logger.info(f"视频处理成功: {output_path}")
+        return output_path
+    except subprocess.CalledProcessError as e:
+        error_msg = e.stderr.decode() if e.stderr else str(e)
+        logger.error(f"处理视频失败: {error_msg}")
+
+        # 如果使用硬件加速失败，尝试使用软件编码
+        if hwaccel:
+            logger.info("尝试使用软件编码作为备选方案")
+            try:
+                # 构建新的命令，使用软件编码
+                fallback_cmd = ['ffmpeg', '-y', '-i', input_path]
+
+                # 保持原有的音频设置
+                if not keep_audio:
+                    fallback_cmd.extend(['-an'])
+                else:
+                    has_audio = check_video_has_audio(input_path)
+                    if has_audio:
+                        fallback_cmd.extend(['-c:a', 'aac', '-b:a', '128k'])
+                    else:
+                        fallback_cmd.extend(['-an'])
+
+                # 保持原有的视频过滤器
+                fallback_cmd.extend([
+                    '-vf', f"{scale_filter},{pad_filter}",
+                    '-r', '30',
+                    '-c:v', 'libx264',
+                    '-preset', 'medium',
+                    '-profile:v', 'high',
+                    '-b:v', '5M',
+                    '-maxrate', '8M',
+                    '-bufsize', '10M',
+                    '-pix_fmt', 'yuv420p',
+                    output_path
+                ])
+
+                logger.info(f"执行备选FFmpeg命令: {' '.join(fallback_cmd)}")
+                subprocess.run(fallback_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                logger.info(f"使用软件编码成功处理视频: {output_path}")
+                return output_path
+            except subprocess.CalledProcessError as fallback_error:
+                fallback_error_msg = fallback_error.stderr.decode() if fallback_error.stderr else str(fallback_error)
+                logger.error(f"备选软件编码也失败: {fallback_error_msg}")
+                raise RuntimeError(f"无法处理视频 {input_path}: 硬件加速和软件编码都失败")
+
+        # 如果不是硬件加速导致的问题，或者备选方案也失败了，抛出原始错误
+        raise RuntimeError(f"处理视频失败: {error_msg}")
+
+
+def combine_clip_videos(
+        output_video_path: str,
+        video_paths: List[str],
+        video_ost_list: List[int],
+        video_aspect: VideoAspect = VideoAspect.portrait,
+        threads: int = 4,
+        force_software_encoding: bool = False,  # 新参数，强制使用软件编码
+) -> str:
+    """
+    合并子视频
+    Args:
+        output_video_path: 合并后的存储路径
+        video_paths: 子视频路径列表
+        video_ost_list: 原声播放列表 (0: 不保留原声, 1: 只保留原声, 2: 保留原声并保留解说)
+        video_aspect: 屏幕比例
+        threads: 线程数
+        force_software_encoding: 是否强制使用软件编码（忽略硬件加速检测）
+
+    Returns:
+        str: 合并后的视频路径
+    """
+    # 检查ffmpeg是否安装
+    if not check_ffmpeg_installation():
+        raise RuntimeError("未找到ffmpeg，请先安装")
+
+    # 准备输出目录
+    output_dir = os.path.dirname(output_video_path)
+    os.makedirs(output_dir, exist_ok=True)
+
+    # 获取目标分辨率
+    aspect = VideoAspect(video_aspect)
+    video_width, video_height = aspect.to_resolution()
+
+    # 检测可用的硬件加速选项
+    hwaccel = None if force_software_encoding else get_hardware_acceleration_option()
+    if hwaccel:
+        logger.info(f"将使用 {hwaccel} 硬件加速")
+    elif force_software_encoding:
+        logger.info("已强制使用软件编码，跳过硬件加速检测")
+    else:
+        logger.info("未检测到兼容的硬件加速，将使用软件编码")
+
+    # Windows系统上，默认使用软件编码以提高兼容性
+    if os.name == 'nt' and hwaccel:
+        logger.warning("在Windows系统上检测到硬件加速，但为了提高兼容性，建议使用软件编码")
+        # 不强制禁用hwaccel，而是在process_single_video中进行额外安全检查
+
+    # 重组视频路径和原声设置为一个字典列表结构
+    video_segments = []
+
+    # 检查视频路径和原声设置列表长度是否匹配
+    if len(video_paths) != len(video_ost_list):
+        logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配")
+        # 调整长度以匹配较短的列表
+        min_length = min(len(video_paths), len(video_ost_list))
+        video_paths = video_paths[:min_length]
+        video_ost_list = video_ost_list[:min_length]
+
+    # 创建视频处理配置字典列表
+    for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)):
+        if not os.path.exists(video_path):
+            logger.warning(f"视频不存在，跳过: {video_path}")
+            continue
+
+        # 检查是否有音频流
+        has_audio = check_video_has_audio(video_path)
+
+        # 构建视频片段配置
+        segment = {
+            "index": i,
+            "path": video_path,
+            "ost": video_ost,
+            "has_audio": has_audio,
+            "keep_audio": video_ost > 0 and has_audio  # 只有当ost>0且实际有音频时才保留
+        }
+
+        # 记录日志
+        if video_ost > 0 and not has_audio:
+            logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost})，但该视频没有音频流")
+
+        video_segments.append(segment)
+
+    # 处理每个视频片段
+    processed_videos = []
+    temp_dir = os.path.join(output_dir, "temp_videos")
+    os.makedirs(temp_dir, exist_ok=True)
+
+    try:
+        # 第一阶段：处理所有视频片段到中间文件
+        for segment in video_segments:
+            # 处理单个视频，去除或保留音频
+            temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4")
+            try:
+                process_single_video(
+                    input_path=segment['path'],
+                    output_path=temp_output,
+                    target_width=video_width,
+                    target_height=video_height,
+                    keep_audio=segment['keep_audio'],
+                    hwaccel=hwaccel
+                )
+                processed_videos.append({
+                    "index": segment["index"],
+                    "path": temp_output,
+                    "keep_audio": segment["keep_audio"]
+                })
+                logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成")
+            except Exception as e:
+                logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}")
+                # 如果使用硬件加速失败，尝试使用软件编码
+                if hwaccel and not force_software_encoding:
+                    logger.info(f"尝试使用软件编码处理视频 {segment['path']}")
+                    try:
+                        process_single_video(
+                            input_path=segment['path'],
+                            output_path=temp_output,
+                            target_width=video_width,
+                            target_height=video_height,
+                            keep_audio=segment['keep_audio'],
+                            hwaccel=None  # 使用软件编码
+                        )
+                        processed_videos.append({
+                            "index": segment["index"],
+                            "path": temp_output,
+                            "keep_audio": segment["keep_audio"]
+                        })
+                        logger.info(f"使用软件编码成功处理视频 {segment['index'] + 1}/{len(video_segments)}")
+                    except Exception as fallback_error:
+                        logger.error(f"使用软件编码处理视频 {segment['path']} 也失败: {str(fallback_error)}")
+                        continue
+                else:
+                    continue
+
+        if not processed_videos:
+            raise ValueError("没有有效的视频片段可以合并")
+
+        # 按原始索引排序处理后的视频
+        processed_videos.sort(key=lambda x: x["index"])
+
+        # 第二阶段：分步骤合并视频 - 避免复杂的filter_complex滤镜
+        try:
+            # 1. 首先，将所有已处理片段的视频流（不含音频）合并到一个临时文件中
+            video_paths_only = [video["path"] for video in processed_videos]
+            video_concat_path = os.path.join(temp_dir, "video_concat.mp4")
+
+            # 创建concat文件，用于合并视频流
+            concat_file = os.path.join(temp_dir, "concat_list.txt")
+            create_ffmpeg_concat_file(video_paths_only, concat_file)
+
+            # 合并所有视频流，但不包含音频
+            concat_cmd = [
+                'ffmpeg', '-y',
+                '-f', 'concat',
+                '-safe', '0',
+                '-i', concat_file,
+                '-c:v', 'libx264',
+                '-preset', 'medium',
+                '-profile:v', 'high',
+                '-an',  # 不包含音频
+                '-threads', str(threads),
+                video_concat_path
+            ]
+
+            subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            logger.info("视频流合并完成")
+
+            # 2. 提取并合并有音频的片段
+            audio_segments = [video for video in processed_videos if video["keep_audio"]]
+
+            if not audio_segments:
+                # 如果没有音频片段，直接使用无音频的合并视频作为最终结果
+                shutil.copy(video_concat_path, output_video_path)
+                logger.info("无音频视频合并完成")
+                return output_video_path
+
+            # 创建音频中间文件
+            audio_files = []
+            for i, segment in enumerate(audio_segments):
+                # 提取音频
+                audio_file = os.path.join(temp_dir, f"audio_{i}.aac")
+                extract_audio_cmd = [
+                    'ffmpeg', '-y',
+                    '-i', segment["path"],
+                    '-vn',  # 不包含视频
+                    '-c:a', 'aac',
+                    '-b:a', '128k',
+                    audio_file
+                ]
+                subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                audio_files.append({
+                    "index": segment["index"],
+                    "path": audio_file
+                })
+                logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成")
+
+            # 3. 计算每个音频片段的时间位置
+            audio_timings = []
+            current_time = 0.0
+
+            # 获取每个视频片段的时长
+            for i, video in enumerate(processed_videos):
+                duration_cmd = [
+                    'ffprobe', '-v', 'error',
+                    '-show_entries', 'format=duration',
+                    '-of', 'csv=p=0',
+                    video["path"]
+                ]
+                result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+                duration = float(result.stdout.strip())
+
+                # 如果当前片段需要保留音频，记录时间位置
+                if video["keep_audio"]:
+                    for audio in audio_files:
+                        if audio["index"] == video["index"]:
+                            audio_timings.append({
+                                "file": audio["path"],
+                                "start": current_time,
+                                "index": video["index"]
+                            })
+                            break
+
+                current_time += duration
+
+            # 4. 创建静音音频轨道作为基础
+            silence_audio = os.path.join(temp_dir, "silence.aac")
+            create_silence_cmd = [
+                'ffmpeg', '-y',
+                '-f', 'lavfi',
+                '-i', f'anullsrc=r=44100:cl=stereo',
+                '-t', str(current_time),  # 总时长
+                '-c:a', 'aac',
+                '-b:a', '128k',
+                silence_audio
+            ]
+            subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            # 5. 创建复杂滤镜命令以混合音频
+            filter_script = os.path.join(temp_dir, "filter_script.txt")
+            with open(filter_script, 'w') as f:
+                f.write(f"[0:a]volume=0.0[silence];\n")  # 首先静音背景轨道
+
+                # 添加每个音频文件
+                for i, timing in enumerate(audio_timings):
+                    f.write(f"[{i+1}:a]adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n")
+
+                # 混合所有音频
+                mix_str = "[silence]"
+                for i in range(len(audio_timings)):
+                    mix_str += f"[a{i}]"
+                mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]"
+                f.write(mix_str)
+
+            # 6. 构建音频合并命令
+            audio_inputs = ['-i', silence_audio]
+            for timing in audio_timings:
+                audio_inputs.extend(['-i', timing["file"]])
+
+            mixed_audio = os.path.join(temp_dir, "mixed_audio.aac")
+            audio_mix_cmd = [
+                'ffmpeg', '-y'
+            ] + audio_inputs + [
+                '-filter_complex_script', filter_script,
+                '-map', '[aout]',
+                '-c:a', 'aac',
+                '-b:a', '128k',
+                mixed_audio
+            ]
+
+            subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            logger.info("音频混合完成")
+
+            # 7. 将合并的视频和混合的音频组合在一起
+            final_cmd = [
+                'ffmpeg', '-y',
+                '-i', video_concat_path,
+                '-i', mixed_audio,
+                '-c:v', 'copy',
+                '-c:a', 'aac',
+                '-map', '0:v:0',
+                '-map', '1:a:0',
+                '-shortest',
+                output_video_path
+            ]
+
+            subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            logger.info("视频最终合并完成")
+
+            return output_video_path
+
+        except subprocess.CalledProcessError as e:
+            logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}")
+
+            # 尝试备用合并方法 - 最简单的无音频合并
+            logger.info("尝试备用合并方法 - 无音频合并")
+            try:
+                concat_file = os.path.join(temp_dir, "concat_list.txt")
+                video_paths_only = [video["path"] for video in processed_videos]
+                create_ffmpeg_concat_file(video_paths_only, concat_file)
+
+                backup_cmd = [
+                    'ffmpeg', '-y',
+                    '-f', 'concat',
+                    '-safe', '0',
+                    '-i', concat_file,
+                    '-c:v', 'copy',
+                    '-an',  # 无音频
+                    output_video_path
+                ]
+
+                subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                logger.warning("使用备用方法（无音频）成功合并视频")
+                return output_video_path
+            except Exception as backup_error:
+                logger.error(f"备用合并方法也失败: {str(backup_error)}")
+                raise RuntimeError(f"无法合并视频: {str(backup_error)}")
+
+    except Exception as e:
+        logger.error(f"合并视频时出错: {str(e)}")
+        raise
+    finally:
+        # 清理临时文件
+        try:
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+                logger.info("已清理临时文件")
+        except Exception as e:
+            logger.warning(f"清理临时文件时出错: {str(e)}")
+
+
+if __name__ == '__main__':
+    video_paths = [
+        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E02_00_14_09_440.mp4',
+        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E08_00_27_11_110.mp4',
+        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E08_00_34_44_480.mp4',
+        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E08_00_42_47_630.mp4',
+        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E09_00_29_48_160.mp4'
+        ]
+
+    combine_clip_videos(
+        output_video_path="/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/merged_123.mp4",
+        video_paths=video_paths,
+        video_ost_list=[1, 1, 1,1,1],
+        video_aspect=VideoAspect.portrait,
+        force_software_encoding=False  # 默认不强制使用软件编码，让系统自动决定
+    )
diff --git a/app/services/script_service.py b/app/services/script_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..461978bd1bda355b0b099bf47715ee15dece56f6
--- /dev/null
+++ b/app/services/script_service.py
@@ -0,0 +1,400 @@
+import os
+import json
+import time
+import asyncio
+import requests
+from app.utils import video_processor
+from loguru import logger
+from typing import List, Dict, Any, Callable
+
+from app.utils import utils, gemini_analyzer, video_processor
+from app.utils.script_generator import ScriptProcessor
+from app.config import config
+
+
+class ScriptGenerator:
+    def __init__(self):
+        self.temp_dir = utils.temp_dir()
+        self.keyframes_dir = os.path.join(self.temp_dir, "keyframes")
+        
+    async def generate_script(
+        self,
+        video_path: str,
+        video_theme: str = "",
+        custom_prompt: str = "",
+        frame_interval_input: int = 5,
+        skip_seconds: int = 0,
+        threshold: int = 30,
+        vision_batch_size: int = 5,
+        vision_llm_provider: str = "gemini",
+        progress_callback: Callable[[float, str], None] = None
+    ) -> List[Dict[Any, Any]]:
+        """
+        生成视频脚本的核心逻辑
+        
+        Args:
+            video_path: 视频文件路径
+            video_theme: 视频主题
+            custom_prompt: 自定义提示词
+            skip_seconds: 跳过开始的秒数
+            threshold: 差异阈值
+            vision_batch_size: 视觉处理批次大小
+            vision_llm_provider: 视觉模型提供商
+            progress_callback: 进度回调函数
+            
+        Returns:
+            List[Dict]: 生成的视频脚本
+        """
+        if progress_callback is None:
+            progress_callback = lambda p, m: None
+            
+        try:
+            # 提取关键帧
+            progress_callback(10, "正在提取关键帧...")
+            keyframe_files = await self._extract_keyframes(
+                video_path, 
+                skip_seconds,
+                threshold
+            )
+            
+            if vision_llm_provider == "gemini":
+                script = await self._process_with_gemini(
+                    keyframe_files,
+                    video_theme,
+                    custom_prompt,
+                    vision_batch_size,
+                    progress_callback
+                )
+            elif vision_llm_provider == "narratoapi":
+                script = await self._process_with_narrato(
+                    keyframe_files,
+                    video_theme,
+                    custom_prompt,
+                    vision_batch_size,
+                    progress_callback
+                )
+            else:
+                raise ValueError(f"Unsupported vision provider: {vision_llm_provider}")
+                
+            return json.loads(script) if isinstance(script, str) else script
+            
+        except Exception as e:
+            logger.exception("Generate script failed")
+            raise
+            
+    async def _extract_keyframes(
+        self,
+        video_path: str,
+        skip_seconds: int,
+        threshold: int
+    ) -> List[str]:
+        """提取视频关键帧"""
+        video_hash = utils.md5(video_path + str(os.path.getmtime(video_path)))
+        video_keyframes_dir = os.path.join(self.keyframes_dir, video_hash)
+        
+        # 检查缓存
+        keyframe_files = []
+        if os.path.exists(video_keyframes_dir):
+            for filename in sorted(os.listdir(video_keyframes_dir)):
+                if filename.endswith('.jpg'):
+                    keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+                    
+            if keyframe_files:
+                logger.info(f"Using cached keyframes: {video_keyframes_dir}")
+                return keyframe_files
+                
+        # 提取新的关键帧
+        os.makedirs(video_keyframes_dir, exist_ok=True)
+        
+        try:
+            processor = video_processor.VideoProcessor(video_path)
+            processor.process_video_pipeline(
+                output_dir=video_keyframes_dir,
+                skip_seconds=skip_seconds,
+                threshold=threshold
+            )
+
+            for filename in sorted(os.listdir(video_keyframes_dir)):
+                if filename.endswith('.jpg'):
+                    keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+                    
+            return keyframe_files
+            
+        except Exception as e:
+            if os.path.exists(video_keyframes_dir):
+                import shutil
+                shutil.rmtree(video_keyframes_dir)
+            raise
+            
+    async def _process_with_gemini(
+        self,
+        keyframe_files: List[str],
+        video_theme: str,
+        custom_prompt: str,
+        vision_batch_size: int,
+        progress_callback: Callable[[float, str], None]
+    ) -> str:
+        """使用Gemini处理视频帧"""
+        progress_callback(30, "正在初始化视觉分析器...")
+        
+        # 获取Gemini配置
+        vision_api_key = config.app.get("vision_gemini_api_key")
+        vision_model = config.app.get("vision_gemini_model_name")
+        
+        if not vision_api_key or not vision_model:
+            raise ValueError("未配置 Gemini API Key 或者模型")
+
+        analyzer = gemini_analyzer.VisionAnalyzer(
+            model_name=vision_model,
+            api_key=vision_api_key,
+        )
+
+        progress_callback(40, "正在分析关键帧...")
+
+        # 执行异步分析
+        results = await analyzer.analyze_images(
+            images=keyframe_files,
+            prompt=config.app.get('vision_analysis_prompt'),
+            batch_size=vision_batch_size
+        )
+
+        progress_callback(60, "正在整理分析结果...")
+        
+        # 合并所有批次的分析结果
+        frame_analysis = ""
+        prev_batch_files = None
+
+        for result in results:
+            if 'error' in result:
+                logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
+                continue
+                
+            batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
+            first_timestamp, last_timestamp, _ = self._get_batch_timestamps(batch_files, prev_batch_files)
+            
+            # 添加带时间戳的分析结果
+            frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
+            frame_analysis += result['response']
+            frame_analysis += "\n"
+            
+            prev_batch_files = batch_files
+        
+        if not frame_analysis.strip():
+            raise Exception("未能生成有效的帧分析结果")
+        
+        progress_callback(70, "正在生成脚本...")
+
+        # 构建帧内容列表
+        frame_content_list = []
+        prev_batch_files = None
+
+        for result in results:
+            if 'error' in result:
+                continue
+            
+            batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
+            _, _, timestamp_range = self._get_batch_timestamps(batch_files, prev_batch_files)
+            
+            frame_content = {
+                "timestamp": timestamp_range,
+                "picture": result['response'],
+                "narration": "",
+                "OST": 2
+            }
+            frame_content_list.append(frame_content)
+            prev_batch_files = batch_files
+
+        if not frame_content_list:
+            raise Exception("没有有效的帧内容可以处理")
+
+        progress_callback(90, "正在生成文案...")
+        
+        # 获取文本生成配置
+        text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+        text_api_key = config.app.get(f'text_{text_provider}_api_key')
+        text_model = config.app.get(f'text_{text_provider}_model_name')
+
+        processor = ScriptProcessor(
+            model_name=text_model,
+            api_key=text_api_key,
+            prompt=custom_prompt,
+            video_theme=video_theme
+        )
+
+        return processor.process_frames(frame_content_list)
+
+    async def _process_with_narrato(
+        self,
+        keyframe_files: List[str],
+        video_theme: str,
+        custom_prompt: str,
+        vision_batch_size: int,
+        progress_callback: Callable[[float, str], None]
+    ) -> str:
+        """Zip the keyframes, submit them to the Narrato API and poll for the result.
+
+        Args:
+            keyframe_files: Paths of the keyframe images to upload.
+            video_theme: Not used by this method.
+            custom_prompt: Forwarded to the API as ``custom_prompt``.
+            vision_batch_size: Forwarded to the API as ``batch_size``.
+            progress_callback: Called with (percent, message) at each stage.
+
+        Returns:
+            The ``result.data`` payload of the finished task.
+
+        Raises:
+            ValueError: If the Narrato API URL or API key is not configured.
+            Exception: On zip failure, an invalid response, task failure or timeout.
+        """
+        import asyncio  # local import: only used for the non-blocking poll delay
+
+        temp_dir = utils.temp_dir("narrato")
+        progress_callback(30, "正在打包关键帧...")
+        zip_path = os.path.join(temp_dir, f"keyframes_{int(time.time())}.zip")
+        try:
+            if not utils.create_zip(keyframe_files, zip_path):
+                raise Exception("打包关键帧失败")
+
+            api_url = config.app.get("narrato_api_url")
+            api_key = config.app.get("narrato_api_key")
+            # Fail fast: a missing URL would otherwise be sent as "None/video/analyze".
+            if not api_url:
+                raise ValueError("未配置 Narrato API URL")
+            if not api_key:
+                raise ValueError("未配置 Narrato API Key")
+
+            headers = {'X-API-Key': api_key, 'accept': 'application/json'}
+            api_params = {
+                'batch_size': vision_batch_size,
+                'use_ai': False,
+                'start_offset': 0,
+                'vision_model': config.app.get('narrato_vision_model', 'gemini-1.5-flash'),
+                'vision_api_key': config.app.get('narrato_vision_key'),
+                'llm_model': config.app.get('narrato_llm_model', 'qwen-plus'),
+                'llm_api_key': config.app.get('narrato_llm_key'),
+                'custom_prompt': custom_prompt
+            }
+
+            progress_callback(40, "正在上传文件...")
+            with open(zip_path, 'rb') as f:
+                files = {'file': (os.path.basename(zip_path), f, 'application/x-zip-compressed')}
+                response = requests.post(
+                    f"{api_url}/video/analyze",
+                    headers=headers,
+                    params=api_params,
+                    files=files,
+                    timeout=30
+                )
+                response.raise_for_status()
+
+            task_id = response.json()["data"].get('task_id')
+            if not task_id:
+                raise Exception(f"无效的API响应: {response.text}")
+
+            progress_callback(50, "正在等待分析结果...")
+            max_retries = 60  # 60 polls, 2 s apart
+            for _ in range(max_retries):
+                try:
+                    status_response = requests.get(
+                        f"{api_url}/video/tasks/{task_id}",
+                        headers=headers,
+                        timeout=10
+                    )
+                    status_response.raise_for_status()
+                    task_status = status_response.json()['data']
+                except requests.RequestException as e:
+                    logger.warning(f"获取任务状态失败，重试中: {str(e)}")
+                else:
+                    if task_status['status'] == 'SUCCESS':
+                        return task_status['result']['data']
+                    if task_status['status'] in ['FAILURE', 'RETRY']:
+                        raise Exception(f"任务失败: {task_status.get('error')}")
+                # Yield to the event loop instead of blocking it with time.sleep().
+                await asyncio.sleep(2)
+            raise Exception("任务执行超时")
+        finally:
+            # Best-effort removal of the temporary archive.
+            try:
+                if os.path.exists(zip_path):
+                    os.remove(zip_path)
+            except Exception as e:
+                logger.warning(f"清理临时文件失败: {str(e)}")
+
+    def _get_batch_files(
+        self,
+        keyframe_files: List[str],
+        result: Dict[str, Any],
+        batch_size: int
+    ) -> List[str]:
+        """Return the slice of ``keyframe_files`` that belongs to ``result['batch_index']``."""
+        first = batch_size * result['batch_index']
+        last = min(len(keyframe_files), first + batch_size)
+        return keyframe_files[first:last]
+
+    def _get_batch_timestamps(
+        self,
+        batch_files: List[str],
+        prev_batch_files: List[str] = None
+    ) -> tuple[str, str, str]:
+        """Derive the start/end timestamps of a batch from its frame file names.
+
+        The time is read from the third ``_``-separated field of each file's
+        base name (with ``.jpg`` removed). A single-frame batch starts at the
+        last frame of ``prev_batch_files`` when that list is non-empty.
+
+        Returns:
+            ``(first, last, "first-last")`` formatted as ``HH:MM:SS,mmm``;
+            all-zero timestamps when ``batch_files`` is empty.
+        """
+        zero = "00:00:00,000"
+
+        def format_timestamp(time_str: str) -> str:
+            """Normalise ``[[HH:]MM:]SS[,mmm]`` to ``HH:MM:SS,mmm`` (zeros on error)."""
+            try:
+                if len(time_str) < 4:
+                    logger.warning(f"Invalid timestamp format: {time_str}")
+                    return zero
+
+                # Optional millisecond suffix after a comma.
+                time_part, ms = time_str, 0
+                if ',' in time_str:
+                    time_part, ms_text = time_str.split(',')
+                    ms = int(ms_text)
+
+                # Leading fields that are absent default to zero.
+                fields = time_part.split(':')
+                if len(fields) == 3:
+                    h, m, s = map(int, fields)
+                elif len(fields) == 2:
+                    h = 0
+                    m, s = map(int, fields)
+                else:
+                    h, m, s = 0, 0, int(fields[0])
+
+                # Carry overflowing seconds into minutes, and minutes into hours.
+                if s >= 60:
+                    carry, s = divmod(s, 60)
+                    m += carry
+                if m >= 60:
+                    carry, m = divmod(m, 60)
+                    h += carry
+                return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+            except Exception as e:
+                logger.error(f"时间戳格式转换错误 {time_str}: {str(e)}")
+                return zero
+
+        if not batch_files:
+            logger.warning("Empty batch files")
+            return zero, zero, f"{zero}-{zero}"
+
+        if len(batch_files) == 1 and prev_batch_files:
+            start_path, end_path = prev_batch_files[-1], batch_files[0]
+        else:
+            start_path, end_path = batch_files[0], batch_files[-1]
+
+        first_time = os.path.basename(start_path).split('_')[2].replace('.jpg', '')
+        last_time = os.path.basename(end_path).split('_')[2].replace('.jpg', '')
+        first_timestamp = format_timestamp(first_time)
+        last_timestamp = format_timestamp(last_time)
+        return first_timestamp, last_timestamp, f"{first_timestamp}-{last_timestamp}"
\ No newline at end of file
diff --git a/app/services/state.py b/app/services/state.py
new file mode 100644
index 0000000000000000000000000000000000000000..51904fb7d726222fd3aea5708d51f13c6985563c
--- /dev/null
+++ b/app/services/state.py
@@ -0,0 +1,122 @@
+import ast
+from abc import ABC, abstractmethod
+from app.config import config
+from app.models import const
+
+
+class BaseState(ABC):
+    """Abstract interface implemented by every task-state backend."""
+    @abstractmethod
+    def update_task(self, task_id: str, state: int, progress: int = 0, **kwargs):
+        """Store ``state``, ``progress`` and any extra fields for ``task_id``."""
+
+    @abstractmethod
+    def get_task(self, task_id: str):
+        """Return whatever the backend holds for ``task_id``."""
+
+
+class MemoryState(BaseState):
+    """Task state kept in an in-process dictionary.
+
+    Each ``update_task`` call replaces the whole record of that task.
+    """
+
+    def __init__(self):
+        self._tasks = {}
+
+    def update_task(
+        self,
+        task_id: str,
+        state: int = const.TASK_STATE_PROCESSING,
+        progress: int = 0,
+        **kwargs,
+    ):
+        """Store ``state``, ``progress`` (capped at 100) and extras for ``task_id``."""
+        # Coerce to int first so float percentages are accepted.
+        capped = min(int(progress), 100)
+        self._tasks[task_id] = dict(state=state, progress=capped, **kwargs)
+
+    def get_task(self, task_id: str):
+        """Return the record of ``task_id``, or ``None`` when it is unknown."""
+        return self._tasks.get(task_id)
+
+    def delete_task(self, task_id: str):
+        """Forget ``task_id``; unknown ids are ignored."""
+        self._tasks.pop(task_id, None)
+
+
+class RedisState(BaseState):
+    """Task state stored in Redis, one hash per task id.
+
+    Values are written as strings and converted back on read.
+    """
+
+    def __init__(self, host="localhost", port=6379, db=0, password=None):
+        # Imported here rather than at module level, so importing this module does not need redis.
+        import redis
+
+        self._redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
+
+    def update_task(
+        self,
+        task_id: str,
+        state: int = const.TASK_STATE_PROCESSING,
+        progress: int = 0,
+        **kwargs,
+    ):
+        """Write ``state``, ``progress`` (capped at 100) and extras as hash fields.
+
+        Every value is stored as ``str(value)`` with one ``HSET`` per field;
+        hash fields not named in this call are left untouched.
+        """
+        fields = dict(state=state, progress=min(int(progress), 100), **kwargs)
+        for name, value in fields.items():
+            self._redis.hset(task_id, name, str(value))
+
+    def get_task(self, task_id: str):
+        """Return the decoded hash of ``task_id``, or ``None`` if it is empty."""
+        raw = self._redis.hgetall(task_id)
+        if not raw:
+            return None
+        return {
+            key.decode("utf-8"): self._convert_to_original_type(value)
+            for key, value in raw.items()
+        }
+
+    def delete_task(self, task_id: str):
+        """Delete the hash stored under ``task_id``."""
+        self._redis.delete(task_id)
+
+    @staticmethod
+    def _convert_to_original_type(value):
+        """Decode a UTF-8 byte string and restore a Python value where possible.
+
+        Tries ``ast.literal_eval`` first, then an ``int`` conversion of plain
+        digit strings, and finally falls back to the decoded text.
+        """
+        text = value.decode("utf-8")
+        try:
+            return ast.literal_eval(text)
+        except (ValueError, SyntaxError):
+            # Not a Python literal (free text, or digits such as "007").
+            pass
+        if text.isdigit():
+            return int(text)
+        # Add more conversions here if needed
+        return text
+
+
+# Global state
+_enable_redis = config.app.get("enable_redis", False)
+_redis_host = config.app.get("redis_host", "localhost")
+_redis_port = config.app.get("redis_port", 6379)
+_redis_db = config.app.get("redis_db", 0)
+_redis_password = config.app.get("redis_password", None)
+
+# The backend is chosen once, at import time, from the "enable_redis" setting.
+if _enable_redis:
+    state = RedisState(
+        host=_redis_host, port=_redis_port, db=_redis_db, password=_redis_password
+    )
+else:
+    state = MemoryState()
diff --git a/app/services/subtitle.py b/app/services/subtitle.py
new file mode 100644
index 0000000000000000000000000000000000000000..c443c3fd71cbe7a13dec113cb28aa8e908f2289a
--- /dev/null
+++ b/app/services/subtitle.py
@@ -0,0 +1,462 @@
+import json
+import os.path
+import re
+import traceback
+from typing import Optional
+
+# from faster_whisper import WhisperModel
+from timeit import default_timer as timer
+from loguru import logger
+import google.generativeai as genai
+from moviepy import VideoFileClip
+import os
+
+from app.config import config
+from app.utils import utils
+
+model_size = config.whisper.get("model_size", "faster-whisper-large-v2")  # not referenced by create() below
+device = config.whisper.get("device", "cpu")  # overwritten by create() when it loads a model
+compute_type = config.whisper.get("compute_type", "int8")  # overwritten by create() as well
+model = None  # Whisper model cached by create() after its first successful load
+
+
+def create(audio_file, subtitle_file: str = ""):
+    """
+    Transcribe an audio file with a local faster-whisper model and write SRT subtitles.
+
+    Args:
+    - audio_file: Path of the audio file.
+    - subtitle_file: Output path (optional); defaults to ``<audio_file>.srt``.
+
+    Returns:
+    None. Subtitles are written to ``subtitle_file``; nothing is written when the model files are missing.
+    """
+    global model, device, compute_type
+    if not model:
+        model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v3"
+        model_bin_file = f"{model_path}/model.bin"
+        if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
+            logger.error(
+                "请先下载 whisper 模型\n\n"
+                "********************************************\n"
+                "下载地址：https://huggingface.co/Systran/faster-whisper-large-v3\n"
+                "存放路径：app/models \n"
+                "********************************************\n"
+            )
+            return None
+        # Deferred import (the module-level one is commented out); without it WhisperModel is undefined.
+        from faster_whisper import WhisperModel
+        use_cuda = False
+        try:
+            # torch is imported inside the helper instead of at module level,
+            # so a missing or broken torch install only disables CUDA.
+            def check_cuda_available():
+                try:
+                    import torch
+                    return torch.cuda.is_available()
+                except (ImportError, RuntimeError) as e:
+                    logger.warning(f"检查CUDA可用性时出错: {e}")
+                    return False
+
+            # Probe CUDA once, right before loading the model.
+            use_cuda = check_cuda_available()
+            
+            if use_cuda:
+                logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
+                try:
+                    model = WhisperModel(
+                        model_size_or_path=model_path,
+                        device="cuda",
+                        compute_type="float16",
+                        local_files_only=True
+                    )
+                    device = "cuda"
+                    compute_type = "float16"
+                    logger.info("成功使用 CUDA 加载模型")
+                except Exception as e:
+                    logger.warning(f"CUDA 加载失败，错误信息: {str(e)}")
+                    logger.warning("回退到 CPU 模式")
+                    use_cuda = False
+            else:
+                logger.info("使用 CPU 模式")
+        except Exception as e:
+            logger.warning(f"CUDA检查过程出错: {e}")
+            logger.warning("默认使用CPU模式")
+            use_cuda = False
+
+        # CUDA is unavailable or the CUDA load failed: load on CPU with int8.
+        if not use_cuda:
+            device = "cpu"
+            compute_type = "int8"
+            logger.info(f"使用 CPU 加载模型: {model_path}")
+            model = WhisperModel(
+                model_size_or_path=model_path,
+                device=device,
+                compute_type=compute_type,
+                local_files_only=True
+            )
+
+        logger.info(f"模型加载完成，使用设备: {device}, 计算类型: {compute_type}")
+
+    if not subtitle_file:
+        subtitle_file = f"{audio_file}.srt"
+    logger.info(f"start, output file: {subtitle_file}")
+
+    segments, info = model.transcribe(
+        audio_file,
+        beam_size=5,
+        word_timestamps=True,
+        vad_filter=True,
+        vad_parameters=dict(min_silence_duration_ms=500),
+        initial_prompt="以下是普通话的句子"
+    )
+
+    logger.info(
+        f"检测到的语言: '{info.language}', probability: {info.language_probability:.2f}"
+    )
+
+    start = timer()
+    subtitles = []
+
+    def recognized(seg_text, seg_start, seg_end):
+        seg_text = seg_text.strip()
+        if not seg_text:
+            return
+
+        msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text)
+        logger.debug(msg)
+
+        subtitles.append(
+            {"msg": seg_text, "start_time": seg_start, "end_time": seg_end}
+        )
+
+    for segment in segments:
+        words_idx = 0
+        words_len = len(segment.words)
+
+        seg_start = 0
+        seg_end = 0
+        seg_text = ""
+
+        if segment.words:
+            is_segmented = False
+            for word in segment.words:
+                if not is_segmented:
+                    seg_start = word.start
+                    is_segmented = True
+
+                seg_end = word.end
+                # Append the word; a word containing punctuation ends the sentence.
+                seg_text += word.word
+
+                if utils.str_contains_punctuation(word.word):
+                    # drop the last character (the punctuation mark)
+                    seg_text = seg_text[:-1]
+                    if not seg_text:
+                        continue
+
+                    recognized(seg_text, seg_start, seg_end)
+
+                    is_segmented = False
+                    seg_text = ""
+
+                if words_idx == 0 and segment.start < word.start:
+                    seg_start = word.start
+                if words_idx == (words_len - 1) and segment.end > word.end:
+                    seg_end = word.end
+                words_idx += 1
+
+        if not seg_text:
+            continue
+
+        recognized(seg_text, seg_start, seg_end)
+
+    end = timer()
+
+    diff = end - start
+    logger.info(f"complete, elapsed: {diff:.2f} s")
+
+    idx = 1
+    lines = []
+    for subtitle in subtitles:
+        text = subtitle.get("msg")
+        if text:
+            lines.append(
+                utils.text_to_srt(
+                    idx, text, subtitle.get("start_time"), subtitle.get("end_time")
+                )
+            )
+            idx += 1
+
+    sub = "\n".join(lines) + "\n"
+    with open(subtitle_file, "w", encoding="utf-8") as f:
+        f.write(sub)
+    logger.info(f"subtitle file created: {subtitle_file}")
+
+
+def file_to_subtitles(filename):
+    """
+    Parse an SRT file into a list of subtitle entries.
+
+    Args:
+    filename (str): Path of the subtitle file.
+
+    Returns:
+    list: ``(index, time line, text)`` tuples numbered from 1 ([] if the file is missing).
+    The last entry is kept even when no blank line follows it.
+    """
+    if not filename or not os.path.isfile(filename):
+        return []
+
+    times_texts = []
+    current_times = None
+    current_text = ""
+    with open(filename, "r", encoding="utf-8") as f:
+        for line in f:
+            if re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line):
+                current_times = line
+            elif line.strip() == "" and current_times:
+                times_texts.append((len(times_texts) + 1, current_times.strip(), current_text.strip()))
+                current_times, current_text = None, ""
+            elif current_times:
+                current_text += line
+    # Flush a trailing entry that was not terminated by a blank line.
+    if current_times:
+        times_texts.append((len(times_texts) + 1, current_times.strip(), current_text.strip()))
+    return times_texts
+
+
+def levenshtein_distance(s1, s2):
+    """Return the edit distance (insert / delete / substitute, cost 1 each) of two sequences."""
+    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
+    if len(shorter) == 0:
+        return len(longer)
+
+    # Row-by-row dynamic programme; only the previous row is kept.
+    prev = list(range(len(shorter) + 1))
+    for row, ch_a in enumerate(longer, start=1):
+        cur = [row]
+        for col, ch_b in enumerate(shorter):
+            cur.append(min(
+                prev[col + 1] + 1,            # insertion
+                cur[col] + 1,                 # deletion
+                prev[col] + (ch_a != ch_b),   # substitution
+            ))
+        prev = cur
+    return prev[-1]
+
+
+def similarity(a, b):
+    """Return 1 - edit_distance / longer_length, case-insensitively (1.0 when both are empty)."""
+    longest = max(len(a), len(b))
+    return 1 - levenshtein_distance(a.lower(), b.lower()) / longest if longest else 1.0
+
+
+def correct(subtitle_file, video_script):
+    """Rewrite ``subtitle_file`` so that its texts follow ``video_script``.
+
+    The script is split into lines by punctuation and walked in parallel with
+    the parsed subtitle entries. Where the texts differ, following entries are
+    merged while that raises the similarity to the script line, and the
+    script line then replaces the subtitle text over the merged time span.
+    Script lines left over reuse the remaining subtitle timings or get a zero
+    timestamp. The file is rewritten only when something was changed.
+
+    Args:
+        subtitle_file: Path of the SRT file to check and, if needed, overwrite.
+        video_script: Full script text the subtitles should match.
+
+    Returns:
+        None.
+    """
+    subtitle_items = file_to_subtitles(subtitle_file)
+    script_lines = utils.split_string_by_punctuations(video_script)
+    total_subs = len(subtitle_items)
+    total_lines = len(script_lines)
+
+    aligned = []
+    changed = False
+    line_no = 0
+    sub_no = 0
+
+    while line_no < total_lines and sub_no < total_subs:
+        wanted = script_lines[line_no].strip()
+        entry = subtitle_items[sub_no]
+        heard = entry[2].strip()
+        line_no += 1
+        sub_no += 1
+
+        if wanted == heard:
+            # Already identical: keep the entry as it is.
+            aligned.append(entry)
+            continue
+
+        # Texts differ: start from this entry's time span ...
+        span = entry[1].split(" --> ")
+        start_time, end_time = span[0], span[1]
+
+        # ... and absorb following entries while that improves the match.
+        while sub_no < total_subs:
+            candidate = heard + " " + subtitle_items[sub_no][2].strip()
+            if similarity(wanted, candidate) <= similarity(wanted, heard):
+                break
+            heard = candidate
+            end_time = subtitle_items[sub_no][1].split(" --> ")[1]
+            sub_no += 1
+
+        # Both outcomes replace the text with the script line; only the log differs.
+        if similarity(wanted, heard) > 0.8:
+            logger.warning(
+                f"Merged/Corrected - Script: {wanted}, Subtitle: {heard}"
+            )
+        else:
+            logger.warning(
+                f"Mismatch - Script: {wanted}, Subtitle: {heard}"
+            )
+        aligned.append(
+            (
+                len(aligned) + 1,
+                f"{start_time} --> {end_time}",
+                wanted,
+            )
+        )
+        changed = True
+
+    # Script lines that remain once the parallel walk has ended.
+    while line_no < total_lines:
+        logger.warning(f"Extra script line: {script_lines[line_no]}")
+        if sub_no < total_subs:
+            # Reuse the timing of a subtitle entry that was not consumed yet.
+            timing = subtitle_items[sub_no][1]
+            sub_no += 1
+        else:
+            # No timing left: use a zero timestamp.
+            timing = "00:00:00,000 --> 00:00:00,000"
+        aligned.append((len(aligned) + 1, timing, script_lines[line_no]))
+        line_no += 1
+        changed = True
+
+    if changed:
+        # Entries are renumbered from 1 when written back.
+        with open(subtitle_file, "w", encoding="utf-8") as fd:
+            for i, item in enumerate(aligned):
+                fd.write(f"{i + 1}\n{item[1]}\n{item[2]}\n\n")
+        logger.info("Subtitle corrected")
+    else:
+        logger.success("Subtitle is correct")
+
+
+def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Optional[str] = None) -> Optional[str]:
+    """Ask Gemini for an SRT transcript of ``audio_file`` and save it.
+
+    Args:
+        audio_file: Path of the audio file; its raw bytes are passed to the model.
+        subtitle_file: Output path; defaults to ``<audio_file>.srt`` when empty.
+        api_key: Gemini API key; without one nothing is attempted.
+
+    Returns the subtitle file path, or ``None`` if the key is missing or any step fails.
+    """
+    if not api_key:
+        logger.error("Gemini API key is not provided")
+        return None
+
+    genai.configure(api_key=api_key)
+    logger.info(f"开始使用Gemini模型处理音频文件: {audio_file}")
+    gemini = genai.GenerativeModel(model_name="gemini-1.5-flash")
+    request = ["生成这段语音的转录文本。请以SRT格式输出，包含时间戳。"]
+    try:
+        with open(audio_file, "rb") as src:
+            request.append(src.read())
+        transcript = gemini.generate_content(request).text
+        target = subtitle_file or f"{audio_file}.srt"
+        with open(target, "w", encoding="utf-8") as dst:
+            dst.write(transcript)
+        logger.info(f"Gemini生成的字幕文件已保存: {target}")
+        return target
+    except Exception as e:
+        logger.error(f"使用Gemini处理音频时出错: {e}")
+        return None
+
+
+def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "") -> Optional[str]:
+    """
+    Extract the audio track of a video and generate a subtitle file from it.
+
+    Args:
+        video_file: Path of the video file.
+        subtitle_file: Output subtitle path (optional). Defaults to
+            ``<video name>.srt`` next to the video.
+
+    Returns:
+        The subtitle file path, or ``None`` if any step raised an exception.
+        ``create`` returns nothing, so the path is also returned when
+        ``create`` only logged an error and wrote no file.
+    """
+    try:
+        # The temporary WAV is written next to the video.
+        video_dir = os.path.dirname(video_file)
+        video_name = os.path.splitext(os.path.basename(video_file))[0]
+        audio_file = os.path.join(video_dir, f"{video_name}_audio.wav")
+
+        # Default the subtitle path to <video name>.srt in the same directory.
+        if not subtitle_file:
+            subtitle_file = os.path.join(video_dir, f"{video_name}.srt")
+
+        logger.info(f"开始从视频提取音频: {video_file}")
+        # Load the video file.
+        video = VideoFileClip(video_file)
+        try:
+            # Export the audio track as 16-bit PCM WAV.
+            logger.info(f"正在提取音频到: {audio_file}")
+            video.audio.write_audiofile(audio_file, codec='pcm_s16le')
+        finally:
+            # Release the clip even when the export fails.
+            video.close()
+
+        logger.info("音频提取完成，开始生成字幕")
+        try:
+            # Transcribe the audio that was just extracted, not a fixed
+            # developer-local file.
+            create(audio_file, subtitle_file)
+        finally:
+            # Remove the temporary WAV whether or not transcription succeeded.
+            if os.path.exists(audio_file):
+                os.remove(audio_file)
+                logger.info("已清理临时音频文件")
+
+        return subtitle_file
+
+    except Exception as e:
+        logger.error(f"处理视频文件时出错: {str(e)}")
+        logger.error(traceback.format_exc())
+        return None
+
+
+if __name__ == "__main__":  # manual run against developer-local files
+    task_id = "123456"
+    task_dir = utils.task_dir(task_id)
+    subtitle_file = f"{task_dir}/subtitle_123456.srt"
+    audio_file = "/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav"
+    video_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/qyn2-2-720p.mp4"
+
+    extract_audio_and_create_subtitle(video_file, subtitle_file)
+
+    # subtitles = file_to_subtitles(subtitle_file)
+    # print(subtitles)
+
+    # # script_file = f"{task_dir}/script.json"
+    # # with open(script_file, "r") as f:
+    # #     script_content = f.read()
+    # # s = json.loads(script_content)
+    # # script = s.get("script")
+    # #
+    # # correct(subtitle_file, script)
+
+    # subtitle_file = f"{task_dir}/subtitle111.srt"
+    # create(audio_file, subtitle_file)
+
+    # # # Process the audio with the Gemini model
+    # # gemini_api_key = config.app.get("gemini_api_key")  # replace with a real API key
+    # # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
+    # #
+    # # if gemini_subtitle_file:
+    # #     print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
diff --git a/app/services/subtitle_merger.py b/app/services/subtitle_merger.py
new file mode 100644
index 0000000000000000000000000000000000000000..9097586296b383b2d790b9bcf45784e9b3bc8c68
--- /dev/null
+++ b/app/services/subtitle_merger.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : subtitle_merger
+@Author : viccy
+@Date   : 2025/5/6 下午4:00 
+'''
+
+import re
+import os
+from datetime import datetime, timedelta
+
+
+def parse_time(time_str):
+    """Convert an SRT timestamp ``HH:MM:SS,mmm`` into a ``timedelta``.
+
+    Raises:
+        ValueError: If the string does not split into those four integer fields.
+    """
+    hh, mm, tail = time_str.split(':')
+    ss, ms = tail.split(',')
+
+    return timedelta(
+        hours=int(hh), minutes=int(mm), seconds=int(ss), milliseconds=int(ms)
+    )
+
+
+def format_time(td):
+    """Format a ``timedelta`` as an SRT timestamp ``HH:MM:SS,mmm``.
+
+    Anything below a millisecond is truncated; days are folded into the hours.
+    """
+    minutes, seconds = divmod(int(td.total_seconds()), 60)
+    hours, minutes = divmod(minutes, 60)
+    millis = td.microseconds // 1000
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
+
+
+def parse_edited_time_range(time_range_str):
+    """Split an ``HH:MM:SS-HH:MM:SS`` range into ``(start, end)`` timedeltas.
+
+    Returns ``(None, None)`` for an empty value or one without exactly one
+    ``-``. Raises ``ValueError`` if a side is not three ``:``-separated integers.
+    """
+    bounds = time_range_str.split('-') if time_range_str else []
+    if len(bounds) != 2:
+        return None, None
+
+    # Convert the fields of both sides before building the timedeltas.
+    start_fields, end_fields = (side.split(':') for side in bounds)
+    h0, m0, s0 = map(int, start_fields)
+    h1, m1, s1 = map(int, end_fields)
+
+    return (
+        timedelta(hours=h0, minutes=m0, seconds=s0),
+        timedelta(hours=h1, minutes=m1, seconds=s1),
+    )
+
+
+def merge_subtitle_files(subtitle_items, output_file=None):
+    """
+    Merge several SRT subtitle files into one, shifting each by its offset.
+
+    Args:
+        subtitle_items: List of dicts. Each provides ``subtitle`` (path of an
+            SRT file) and ``editedTimeRange`` (``HH:MM:SS-HH:MM:SS``); the
+            range start is added to every timestamp of that file. Items whose
+            file is missing or whose range yields no start time are skipped.
+        output_file: Output path. When ``None`` the file is written next to
+            the first merged subtitle file and named
+            ``merged_subtitle_<first start>-<last end>.srt``.
+
+    Returns:
+        Path of the merged subtitle file.
+
+    Raises:
+        ValueError: If a cue time line or an ``editedTimeRange`` is malformed.
+        IndexError: If ``subtitle_items`` is empty and ``output_file`` is
+            ``None``.
+    """
+    # Process the items in the order of their range start (no range counts as 0).
+    sorted_items = sorted(subtitle_items,
+                         key=lambda x: parse_edited_time_range(x.get('editedTimeRange', ''))[0] or timedelta())
+
+    merged_subtitles = []
+    merged_items = []  # items whose subtitle file was actually read
+    subtitle_index = 1
+
+    for item in sorted_items:
+        if not item.get('subtitle') or not os.path.exists(item.get('subtitle')):
+            continue
+
+        # The start of editedTimeRange is the offset applied to this file.
+        offset_time, _ = parse_edited_time_range(item.get('editedTimeRange', ''))
+        if offset_time is None:
+            print(f"警告: 无法从项目 {item.get('_id')} 的editedTimeRange中提取时间范围，跳过该项")
+            continue
+
+        with open(item['subtitle'], 'r', encoding='utf-8') as file:
+            content = file.read()
+        merged_items.append(item)
+
+        # SRT cues are separated by blank lines.
+        for block in re.split(r'\n\s*\n', content.strip()):
+            lines = block.strip().split('\n')
+            if len(lines) < 3:  # index, time line and at least one text line
+                continue
+
+            time_parts = lines[1].split(' --> ')
+            if len(time_parts) != 2:
+                continue
+
+            # Shift both ends of the cue by the item's offset.
+            adjusted_start_time = parse_time(time_parts[0]) + offset_time
+            adjusted_end_time = parse_time(time_parts[1]) + offset_time
+            adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
+
+            # Renumber the cue; its text lines are kept unchanged.
+            new_block = [
+                str(subtitle_index),
+                adjusted_time_line,
+                *lines[2:]
+            ]
+            merged_subtitles.append('\n'.join(new_block))
+            subtitle_index += 1
+
+    if output_file is None:
+        # Name the file after the items that were really merged: the first or
+        # last input may have been skipped (e.g. an empty ``subtitle`` path).
+        # When nothing was merged the raw inputs are used instead.
+        naming_items = merged_items or sorted_items
+        dir_path = os.path.dirname(naming_items[0]['subtitle'])
+        first_start = parse_edited_time_range(naming_items[0]['editedTimeRange'])[0]
+        last_end = parse_edited_time_range(naming_items[-1]['editedTimeRange'])[1]
+
+        def hms(td):
+            """Render the ``seconds`` component of ``td`` as ``HH_MM_SS``."""
+            secs = td.seconds
+            return f"{secs // 3600:02d}_{(secs % 3600) // 60:02d}_{secs % 60:02d}"
+
+        output_file = os.path.join(dir_path, f"merged_subtitle_{hms(first_start)}-{hms(last_end)}.srt")
+
+    # Cues are separated by one blank line in the output.
+    with open(output_file, 'w', encoding='utf-8') as file:
+        file.write('\n\n'.join(merged_subtitles))
+
+    return output_file
+
+
+if __name__ == '__main__':
+    # 测试数据
+    test_data = [
+        {'picture': '【解说】好的，各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！', 
+         'timestamp': '00:00:00-00:01:15', 
+         'narration': '好的各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！上集片尾那个巨大的悬念，这一集就立刻揭晓了！范闲假死归来，他面临的第一个，也是最大的难关，就是如何面对他最敬爱的，同时也是最可怕的那个人——庆帝！', 
+         'OST': 0, 
+         '_id': 1, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3', 
+         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt', 
+         'sourceTimeRange': '00:00:00-00:00:26', 
+         'duration': 26, 
+         'editedTimeRange': '00:00:00-00:00:26'
+        },
+        {'picture': '【解说】上一集我们看到，范闲在北齐遭遇了惊天变故，生死不明！', 
+         'timestamp': '00:01:15-00:04:40', 
+         'narration': '但我们都知道，他绝不可能就这么轻易退场！第二集一开场，范闲就已经秘密回到了京都。他的生死传闻，可不像我们想象中那样只是小范围流传，而是…', 
+         'OST': 0, 
+         '_id': 2, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3', 
+         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt', 
+         'sourceTimeRange': '00:01:15-00:01:29', 
+         'duration': 14, 
+         'editedTimeRange': '00:00:26-00:00:40'
+        },
+        {'picture': '【解说】"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。', 
+         'timestamp': '00:04:58-00:05:45', 
+         'narration': '"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。但范闲是谁啊？他偏要反其道而行之！他竟然决定，直接去见庆帝！冒着天大的风险，用"假死"这个事实去赌庆帝的态度！', 
+         'OST': 0, 
+         '_id': 4, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', 
+         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt', 
+         'sourceTimeRange': '00:04:58-00:05:20', 
+         'duration': 22, 
+         'editedTimeRange': '00:00:57-00:01:19'
+        },
+        {'picture': '【解说】但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！', 
+         'timestamp': '00:05:45-00:06:00', 
+         'narration': '但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！', 
+         'OST': 0, 
+         '_id': 5, 
+         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3', 
+         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt', 
+         'sourceTimeRange': '00:05:45-00:05:53', 
+         'duration': 8, 
+         'editedTimeRange': '00:01:19-00:01:27'
+        }
+    ]
+    
+    output_file = merge_subtitle_files(test_data)
+    print(f"字幕文件已合并至: {output_file}")
diff --git a/app/services/task.py b/app/services/task.py
new file mode 100644
index 0000000000000000000000000000000000000000..c257d396e773e2179b3794e4fdd9e182779f5409
--- /dev/null
+++ b/app/services/task.py
@@ -0,0 +1,398 @@
+import math
+import json
+import os.path
+import re
+import traceback
+from os import path
+from loguru import logger
+
+from app.config import config
+from app.models import const
+from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams
+from app.services import (llm, material, subtitle, video, voice, audio_merger,
+                          subtitle_merger, clip_video, merger_video, update_script, generate_video)
+from app.services import state as sm
+from app.utils import utils
+
+
+# def generate_script(task_id, params):
+#     logger.info("\n\n## generating video script")
+#     video_script = params.video_script.strip()
+#     if not video_script:
+#         video_script = llm.generate_script(
+#             video_subject=params.video_subject,
+#             language=params.video_language,
+#             paragraph_number=params.paragraph_number,
+#         )
+#     else:
+#         logger.debug(f"video script: \n{video_script}")
+
+#     if not video_script:
+#         sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+#         logger.error("failed to generate video script.")
+#         return None
+
+#     return video_script
+
+
+# def generate_terms(task_id, params, video_script):
+#     logger.info("\n\n## generating video terms")
+#     video_terms = params.video_terms
+#     if not video_terms:
+#         video_terms = llm.generate_terms(
+#             video_subject=params.video_subject, video_script=video_script, amount=5
+#         )
+#     else:
+#         if isinstance(video_terms, str):
+#             video_terms = [term.strip() for term in re.split(r"[,，]", video_terms)]
+#         elif isinstance(video_terms, list):
+#             video_terms = [term.strip() for term in video_terms]
+#         else:
+#             raise ValueError("video_terms must be a string or a list of strings.")
+
+#         logger.debug(f"video terms: {utils.to_json(video_terms)}")
+
+#     if not video_terms:
+#         sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+#         logger.error("failed to generate video terms.")
+#         return None
+
+#     return video_terms
+
+
+# def save_script_data(task_id, video_script, video_terms, params):
+#     script_file = path.join(utils.task_dir(task_id), "script.json")
+#     script_data = {
+#         "script": video_script,
+#         "search_terms": video_terms,
+#         "params": params,
+#     }
+
+#     with open(script_file, "w", encoding="utf-8") as f:
+#         f.write(utils.to_json(script_data))
+
+
+# def generate_audio(task_id, params, video_script):
+#     logger.info("\n\n## generating audio")
+#     audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
+#     sub_maker = voice.tts(
+#         text=video_script,
+#         voice_name=voice.parse_voice_name(params.voice_name),
+#         voice_rate=params.voice_rate,
+#         voice_file=audio_file,
+#     )
+#     if sub_maker is None:
+#         sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+#         logger.error(
+#             """failed to generate audio:
+# 1. check if the language of the voice matches the language of the video script.
+# 2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
+#         """.strip()
+#         )
+#         return None, None, None
+
+#     audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
+#     return audio_file, audio_duration, sub_maker
+
+
+# def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
+#     if not params.subtitle_enabled:
+#         return ""
+
+#     subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
+#     subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
+#     logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
+
+#     subtitle_fallback = False
+#     if subtitle_provider == "edge":
+#         voice.create_subtitle(
+#             text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
+#         )
+#         if not os.path.exists(subtitle_path):
+#             subtitle_fallback = True
+#             logger.warning("subtitle file not found, fallback to whisper")
+
+#     if subtitle_provider == "whisper" or subtitle_fallback:
+#         subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
+#         logger.info("\n\n## correcting subtitle")
+#         subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
+
+#     subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
+#     if not subtitle_lines:
+#         logger.warning(f"subtitle file is invalid: {subtitle_path}")
+#         return ""
+
+#     return subtitle_path
+
+
+# def get_video_materials(task_id, params, video_terms, audio_duration):
+#     if params.video_source == "local":
+#         logger.info("\n\n## preprocess local materials")
+#         materials = video.preprocess_video(
+#             materials=params.video_materials, clip_duration=params.video_clip_duration
+#         )
+#         if not materials:
+#             sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+#             logger.error(
+#                 "no valid materials found, please check the materials and try again."
+#             )
+#             return None
+#         return [material_info.url for material_info in materials]
+#     else:
+#         logger.info(f"\n\n## downloading videos from {params.video_source}")
+#         downloaded_videos = material.download_videos(
+#             task_id=task_id,
+#             search_terms=video_terms,
+#             source=params.video_source,
+#             video_aspect=params.video_aspect,
+#             video_contact_mode=params.video_concat_mode,
+#             audio_duration=audio_duration * params.video_count,
+#             max_clip_duration=params.video_clip_duration,
+#         )
+#         if not downloaded_videos:
+#             sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+#             logger.error(
+#                 "failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
+#             )
+#             return None
+#         return downloaded_videos
+
+
+def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
+    """
+    Background task: run the automatic clipping pipeline for one task.
+    Args:
+        task_id: Task ID, used for state updates and to locate the task directory
+        params: Clip parameters (script path, source video, voice/subtitle/volume options)
+        subclip_path_videos: Pre-cut clip paths keyed by script ``_id``, used as fallback
+
+    Returns:
+        dict with the keys "videos" and "combined_videos" (lists of output paths)
+
+    Raises:
+        ValueError: the clip script is missing or cannot be read
+    """
+    # Keep the merged paths local and pre-set to "": as module globals they were shared
+    # between tasks and stayed undefined/stale whenever the merge step below failed.
+    merged_audio_path = merged_subtitle_path = ""
+
+    logger.info(f"\n\n## 开始任务: {task_id}")
+    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)
+
+    # --- 1. Load the clip script ---
+    logger.info("\n\n## 1. 加载视频脚本")
+    video_script_path = params.video_clip_json_path
+
+    if not path.exists(video_script_path):
+        logger.error(f"video_script_path: {video_script_path}")
+        raise ValueError("解说脚本不存在！请检查配置是否正确。")
+
+    try:
+        with open(video_script_path, "r", encoding="utf-8") as f:
+            list_script = json.load(f)
+        video_list = [i['narration'] for i in list_script]
+        video_ost = [i['OST'] for i in list_script]
+        time_list = [i['timestamp'] for i in list_script]
+        video_script = " ".join(video_list)
+    except Exception as e:
+        logger.error(f"无法读取视频json脚本，请检查脚本格式是否正确: {e}")
+        raise ValueError("无法读取视频json脚本，请检查脚本格式是否正确") from e
+
+    logger.debug(f"解说完整脚本: \n{video_script}")
+    logger.debug(f"解说 OST 列表: \n{video_ost}")
+    logger.debug(f"解说时间戳列表: \n{time_list}")
+
+    # ------------------------------------------------------------------
+    # 2. Generate the narration audio with TTS
+    # ------------------------------------------------------------------
+    logger.info("\n\n## 2. 根据OST设置生成音频列表")
+    # Only OST 0 (narration only) and OST 2 (narration plus original sound) need TTS audio.
+    tts_segments = [
+        segment for segment in list_script
+        if segment['OST'] in [0, 2]
+    ]
+    logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}")
+
+    tts_results = voice.tts_multiple(
+        task_id=task_id,
+        list_script=tts_segments,  # only the segments that need TTS
+        voice_name=params.voice_name,
+        voice_rate=params.voice_rate,
+        voice_pitch=params.voice_pitch,
+    )
+
+    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
+
+    # ------------------------------------------------------------------
+    # (Disabled) Optional step: generate subtitles with whisper.
+    # ------------------------------------------------------------------
+    # if merged_subtitle_path is None:
+    #     if audio_files:
+    #         merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
+    #         subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
+    #         logger.info(f"\n\n使用 {subtitle_provider} 生成字幕")
+    #
+    #         subtitle.create(
+    #             audio_file=merged_audio_path,
+    #             subtitle_file=merged_subtitle_path,
+    #         )
+    #         subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path)
+    #         if not subtitle_lines:
+    #             logger.warning(f"字幕文件无效: {merged_subtitle_path}")
+    #
+    # sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
+
+    # ------------------------------------------------------------------
+    # 3. Clip the video - trim footage that is longer than the audio
+    # ------------------------------------------------------------------
+    logger.info("\n\n## 3. 裁剪视频")
+    video_clip_result = clip_video.clip_video(params.video_origin_path, tts_results)
+    # Index the TTS outputs by _id so the script items can be updated with them
+    tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
+    subclip_clip_result = {
+        tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
+    }
+    new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
+
+    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)
+
+    # ------------------------------------------------------------------
+    # 4. Merge audio and subtitles
+    # ------------------------------------------------------------------
+    logger.info("\n\n## 4. 合并音频和字幕")
+    total_duration = sum(script["duration"] for script in new_script_list)
+    if tts_segments:
+        try:
+            # Merge the per-segment audio files
+            merged_audio_path = audio_merger.merge_audio_files(
+                task_id=task_id,
+                total_duration=total_duration,
+                list_script=new_script_list
+            )
+            logger.info(f"音频文件合并成功->{merged_audio_path}")
+            # Merge the per-segment subtitle files
+            merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
+            logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
+        except Exception as e:
+            # Best effort: keep going; whatever was not merged keeps its "" path.
+            logger.error(f"合并音频文件失败: {str(e)}")
+    else:
+        # Nothing to merge; both paths keep their initial "" value.
+        logger.warning("没有需要合并的音频/字幕")
+
+    # ------------------------------------------------------------------
+    # 5. Merge the video clips
+    # ------------------------------------------------------------------
+    final_video_paths = []
+    combined_video_paths = []
+
+    combined_video_path = path.join(utils.task_dir(task_id), "merger.mp4")
+    logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")
+    # When a script item has no clipped video, fall back to the clip in subclip_path_videos
+    video_clips = [item.get('video') or subclip_path_videos.get(item.get('_id', '')) for item in new_script_list]
+
+    merger_video.combine_clip_videos(
+        output_video_path=combined_video_path,
+        video_paths=video_clips,
+        video_ost_list=video_ost,
+        video_aspect=params.video_aspect,
+        threads=params.n_threads
+    )
+    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)
+
+    # ------------------------------------------------------------------
+    # 6. Merge subtitles / BGM / narration / video
+    # ------------------------------------------------------------------
+    output_video_path = path.join(utils.task_dir(task_id), "combined.mp4")
+    logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")
+
+    # Background music file as returned by utils.get_bgm_file()
+    bgm_path = utils.get_bgm_file()
+
+    # Mixing and subtitle options handed to merge_materials
+    options = {
+        'voice_volume': params.tts_volume,  # narration volume
+        'bgm_volume': params.bgm_volume,  # background music volume
+        'original_audio_volume': params.original_volume,  # original sound volume, 0 means do not keep it
+        'keep_original_audio': True,  # whether to keep the original sound
+        'subtitle_font': params.font_name,  # relative font path, looked up under font_dir()
+        'subtitle_font_size': params.font_size,
+        'subtitle_color': params.text_fore_color,
+        'subtitle_bg_color': None,  # None means a transparent background
+        'subtitle_position': params.subtitle_position,
+        'custom_position': params.custom_position,
+        'threads': params.n_threads
+    }
+    generate_video.merge_materials(
+        video_path=combined_video_path,
+        audio_path=merged_audio_path,
+        subtitle_path=merged_subtitle_path,
+        bgm_path=bgm_path,
+        output_path=output_video_path,
+        options=options
+    )
+
+    final_video_paths.append(output_video_path)
+    combined_video_paths.append(combined_video_path)
+
+    logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")
+
+    kwargs = {
+        "videos": final_video_paths,
+        "combined_videos": combined_video_paths
+    }
+    sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
+    return kwargs
+
+
+def validate_params(video_path, audio_path, output_file, params):
+    """
+    Validate the inputs and make sure the output directory exists.
+    Args:
+        video_path: Path of the video file
+        audio_path: Path of the audio file (may be an empty string)
+        output_file: Path of the output file
+        params: Video parameters
+
+    Raises:
+        FileNotFoundError: a given input file does not exist
+        ValueError: a required argument is empty
+    """
+    if not video_path:
+        raise ValueError("视频路径不能为空")
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"视频文件不存在: {video_path}")
+
+    # The audio file is optional; check it only when a path was given.
+    if audio_path and not os.path.exists(audio_path):
+        raise FileNotFoundError(f"音频文件不存在: {audio_path}")
+
+    if not output_file:
+        raise ValueError("输出文件路径不能为空")
+    if not params:
+        raise ValueError("视频参数不能为空")
+
+    # Create the output directory only after every check has passed. dirname() is ""
+    # for a bare file name, and os.makedirs("") raises, so that case is skipped.
+    output_dir = os.path.dirname(output_file)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+
+if __name__ == "__main__":
+    task_id = "demo"
+    # The clips are cut ahead of time so each one is easy to inspect.
+    clip_dir = '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97'
+    subclip_path_videos = {
+        1: f'{clip_dir}/vid_00-00-05-390@00-00-57-980.mp4',
+        2: f'{clip_dir}/vid_00-00-28-900@00-00-43-700.mp4',
+        3: f'{clip_dir}/vid_00-01-17-840@00-01-27-600.mp4',
+        4: f'{clip_dir}/vid_00-02-35-460@00-02-52-380.mp4',
+        5: f'{clip_dir}/vid_00-06-59-520@00-07-29-500.mp4',
+    }
+    resource_dir = '/Users/apple/Desktop/home/NarratoAI/resource'
+    params = VideoClipParams(
+        video_clip_json_path=f"{resource_dir}/scripts/2025-0507-223311.json",
+        video_origin_path=f"{resource_dir}/videos/merged_video_4938.mp4",
+    )
+    start_subclip(task_id, params, subclip_path_videos)
diff --git a/app/services/update_script.py b/app/services/update_script.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb9663ca18c97f7109b0f6a7f929649daec5253
--- /dev/null
+++ b/app/services/update_script.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : update_script
+@Author : 小林同学
+@Date   : 2025/5/6 下午11:00 
+'''
+
+import re
+import os
+from typing import Dict, List, Any, Tuple, Union
+
+
+def extract_timestamp_from_video_path(video_path: str) -> str:
+    """
+    Read the time range that is encoded in a clip's file name.
+
+    Args:
+        video_path: Path of the clip; only its base name is inspected
+
+    Returns:
+        'HH:MM:SS,sss-HH:MM:SS,sss' or 'HH:MM:SS-HH:MM:SS' depending on the name format, "" if neither matches
+    """
+    base_name = os.path.basename(video_path)
+
+    # Naming scheme with milliseconds: vid_00-00-00-000@00-00-20-250.mp4
+    ms_pattern = r'vid_(\d{2})-(\d{2})-(\d{2})-(\d{3})@(\d{2})-(\d{2})-(\d{2})-(\d{3})\.mp4'
+    ms_hit = re.search(ms_pattern, base_name)
+    if ms_hit is not None:
+        h1, m1, s1, ms1, h2, m2, s2, ms2 = ms_hit.groups()
+        return f"{h1}:{m1}:{s1},{ms1}-{h2}:{m2}:{s2},{ms2}"
+
+    # Older naming scheme without milliseconds: vid-00-00-00-00-00-00.mp4
+    legacy_pattern = r'vid-(\d{2}-\d{2}-\d{2})-(\d{2}-\d{2}-\d{2})\.mp4'
+    legacy_hit = re.search(legacy_pattern, base_name)
+    if legacy_hit is None:
+        # Neither naming scheme matched
+        return ""
+
+    # Each half is HH-MM-SS; swap the dashes for colons.
+    begin, finish = legacy_hit.groups()
+    begin, finish = begin.replace('-', ':'), finish.replace('-', ':')
+    return f"{begin}-{finish}"
+
+
+def calculate_duration(timestamp: str) -> float:
+    """
+    Return the length in seconds of a 'start-end' time range.
+
+    Args:
+        timestamp: Range written as 'HH:MM:SS-HH:MM:SS' or 'HH:MM:SS,sss-HH:MM:SS,sss'
+            (the ',sss' millisecond part is optional on each side)
+
+    Returns:
+        End minus start in seconds, rounded to two decimals; 0.0 when the
+        value cannot be parsed (e.g. wrong shape, non-numeric fields, None)
+
+    Examples:
+        '00:00:00,000-00:00:20,250' -> 20.25
+        '00:01:15-00:01:29' -> 14
+    """
+
+    def _to_seconds(clock):
+        # Convert one side of the range to seconds.
+        # Accepts 'HH:MM:SS' and 'HH:MM:SS,sss'.
+        fraction = 0
+        if ',' in clock:
+            pieces = clock.split(',')
+            # pieces[1] holds the millisecond digits; read them as a decimal fraction.
+            fraction = float('0.' + pieces[1])
+            clock = pieces[0]
+        hours, minutes, secs = map(int, clock.split(':'))
+        return hours * 3600 + minutes * 60 + secs + fraction
+
+    try:
+        # Exactly one '-' separator is expected; anything else fails the unpacking.
+        begin, finish = timestamp.split('-')
+        begin_seconds = _to_seconds(begin)
+        finish_seconds = _to_seconds(finish)
+    except (ValueError, AttributeError):
+        # Malformed input is reported as a zero-length range instead of raising.
+        return 0.0
+
+    # Difference in seconds
+    return round(finish_seconds - begin_seconds, 2)
+
+
+def update_script_timestamps(
+    script_list: List[Dict[str, Any]],
+    video_result: Dict[Union[str, int], str],
+    audio_result: Dict[Union[str, int], str] = None,
+    subtitle_result: Dict[Union[str, int], str] = None,
+    calculate_edited_timerange: bool = True
+) -> List[Dict[str, Any]]:
+    """
+    Update the timestamps in script_list from the clip files in video_result, add a
+    duration, and attach audio / subtitle paths from audio_result and subtitle_result.
+
+    Args:
+        script_list: Original script items
+        video_result: Clip results; key is the original timestamp or the _id, value is the video path
+        audio_result: Audio results; key is the original timestamp or the _id, value is the audio path
+        subtitle_result: Subtitle results; key is the original timestamp or the _id, value is the subtitle path
+        calculate_edited_timerange: Whether to compute each item's time range in the final video
+
+    Returns:
+        Updated shallow copies of the script items; script_list itself is not modified
+    """
+    # Collect updated copies instead of changing the caller's items
+    updated_script = []
+
+    # Map each _id / timestamp key to its clip path and the time range parsed from the file name
+    id_timestamp_mapping = {}
+    for key, video_path in video_result.items():
+        new_timestamp = extract_timestamp_from_video_path(video_path)
+        if new_timestamp:
+            id_timestamp_mapping[key] = {
+                'new_timestamp': new_timestamp,
+                'video_path': video_path
+            }
+
+    # Running total of durations, used to place each item in the final video
+    accumulated_duration = 0.0
+
+    # Update every script item
+    for item in script_list:
+        item_copy = item.copy()
+        item_id = item_copy.get('_id')
+        orig_timestamp = item_copy.get('timestamp', '')
+
+        # Audio, subtitle and video paths default to empty strings
+        item_copy['audio'] = ""
+        item_copy['subtitle'] = ""
+        item_copy['video'] = ""
+
+        # Audio path: look up by _id first (compared with None so an _id of 0 still matches), then by timestamp
+        if audio_result:
+            if item_id is not None and item_id in audio_result:
+                item_copy['audio'] = audio_result[item_id]
+            elif orig_timestamp in audio_result:
+                item_copy['audio'] = audio_result[orig_timestamp]
+
+        # Subtitle path: same lookup order as for the audio path
+        if subtitle_result:
+            if item_id is not None and item_id in subtitle_result:
+                item_copy['subtitle'] = subtitle_result[item_id]
+            elif orig_timestamp in subtitle_result:
+                item_copy['subtitle'] = subtitle_result[orig_timestamp]
+
+        # Video path: same lookup order
+        if item_id is not None and item_id in video_result:
+            item_copy['video'] = video_result[item_id]
+        elif orig_timestamp in video_result:
+            item_copy['video'] = video_result[orig_timestamp]
+
+        # Update the source time range and compute the duration
+        current_duration = 0.0
+        if item_id is not None and item_id in id_timestamp_mapping:
+            # The clip was found by _id: use the time range from its file name
+            item_copy['sourceTimeRange'] = id_timestamp_mapping[item_id]['new_timestamp']
+            current_duration = calculate_duration(item_copy['sourceTimeRange'])
+            item_copy['duration'] = current_duration
+        elif orig_timestamp in id_timestamp_mapping:
+            # The clip was found by the original timestamp
+            item_copy['sourceTimeRange'] = id_timestamp_mapping[orig_timestamp]['new_timestamp']
+            current_duration = calculate_duration(item_copy['sourceTimeRange'])
+            item_copy['duration'] = current_duration
+        elif orig_timestamp:
+            # No clip for this item: keep the original timestamp and still record a duration
+            item_copy['sourceTimeRange'] = orig_timestamp
+            current_duration = calculate_duration(orig_timestamp)
+            item_copy['duration'] = current_duration
+
+        # Time range of this item inside the final video
+        if calculate_edited_timerange and current_duration > 0:
+            start_time_seconds = accumulated_duration
+            end_time_seconds = accumulated_duration + current_duration
+
+            # Convert the second counts to HH:MM:SS (fractions of a second are dropped)
+            start_h = int(start_time_seconds // 3600)
+            start_m = int((start_time_seconds % 3600) // 60)
+            start_s = int(start_time_seconds % 60)
+
+            end_h = int(end_time_seconds // 3600)
+            end_m = int((end_time_seconds % 3600) // 60)
+            end_s = int(end_time_seconds % 60)
+
+            item_copy['editedTimeRange'] = f"{start_h:02d}:{start_m:02d}:{start_s:02d}-{end_h:02d}:{end_m:02d}:{end_s:02d}"
+
+            # Advance the running total
+            accumulated_duration = end_time_seconds
+
+        updated_script.append(item_copy)
+
+    return updated_script
+
+
+if __name__ == '__main__':
+    list_script = [
+        {
+            'picture': '【解说】好的，各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！',
+            'timestamp': '00:00:00,001-00:01:15,001',
+            'narration': '好的各位，欢迎回到我的频道！《庆余年 2》刚开播就给了我们一个王炸！范闲在北齐"死"了？这怎么可能！上集片尾那个巨大的悬念，这一集就立刻揭晓了！范闲假死归来，他面临的第一个，也是最大的难关，就是如何面对他最敬爱的，同时也是最可怕的那个人——庆帝！',
+            'OST': 0,
+            '_id': 1
+        },
+        {
+            'picture': '【解说】上一集我们看到，范闲在北齐遭遇了惊天变故，生死不明！',
+            'timestamp': '00:01:15,001-00:04:40,001',
+            'narration': '但我们都知道，他绝不可能就这么轻易退场！第二集一开场，范闲就已经秘密回到了京都。他的生死传闻，可不像我们想象中那样只是小范围流传，而是…',
+            'OST': 0,
+            '_id': 2
+        },
+        {
+            'picture': '画面切到王启年小心翼翼地向范闲汇报。',
+            'timestamp': '00:04:41,001-00:04:58,001',
+            'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
+            'OST': 1,
+            '_id': 3
+        },
+        {
+            'picture': '【解说】"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。',
+            'timestamp': '00:04:58,001-00:05:45,001',
+            'narration': '"欺君之罪"！在封建王朝，这可是抄家灭族的大罪！搁一般人，肯定脚底抹油溜之大吉了。但范闲是谁啊？他偏要反其道而行之！他竟然决定，直接去见庆帝！冒着天大的风险，用"假死"这个事实去赌庆帝的态度！',
+            'OST': 0,
+            '_id': 4
+        },
+        {
+            'picture': '【解说】但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！',
+            'timestamp': '00:05:45,001-00:06:00,001',
+            'narration': '但想见庆帝，哪有那么容易？范闲艺高人胆大，竟然选择了最激进的方式——闯宫！',
+            'OST': 0,
+            '_id': 5
+        },
+        {
+            'picture': '画面切换到范闲蒙面闯入皇宫，被侍卫包围的场景。',
+            'timestamp': '00:06:00,001-00:06:03,001',
+            'narration': '抓刺客',
+            'OST': 1,
+            '_id': 6
+        }]
+    video_res = {
+        1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-00-000@00-00-20-250.mp4',
+        2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-30-000@00-00-48-950.mp4',
+        4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-00-000@00-01-15-688.mp4',
+        5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-30-000@00-01-49-512.mp4'}
+    audio_res = {
+        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
+        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
+        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
+        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'}
+    sub_res = {
+        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
+        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
+        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
+        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt'}
+    
+    # Demo run: update the sample script with the sample results and print one line per item
+    updated_list_script = update_script_timestamps(list_script, video_res, audio_res, sub_res)
+    for item in updated_list_script:
+        print(
+            f"ID: {item['_id']} | Picture: {item['picture'][:20]}... | Timestamp: {item['timestamp']} | " +
+            f"SourceTimeRange: {item['sourceTimeRange']} | EditedTimeRange: {item.get('editedTimeRange', '')} | " +
+            f"Duration: {item['duration']} 秒 | Audio: {item['audio']} | Video: {item['video']} | Subtitle: {item['subtitle']}")
diff --git a/app/services/video.py b/app/services/video.py
new file mode 100644
index 0000000000000000000000000000000000000000..087dbdff0bdbe0cab1910b06cd47ae4f01dd2904
--- /dev/null
+++ b/app/services/video.py
@@ -0,0 +1,365 @@
+import traceback
+
+# import pysrt
+from typing import Optional
+from typing import List
+from loguru import logger
+from moviepy import *
+from PIL import ImageFont
+from contextlib import contextmanager
+from moviepy import (
+    VideoFileClip,
+    AudioFileClip,
+    TextClip,
+    CompositeVideoClip,
+    CompositeAudioClip
+)
+
+
+from app.models.schema import VideoAspect, SubtitlePosition
+
+
+def wrap_text(text, max_width, font, fontsize=60):
+    """
+    Wrap text so that every line fits within a maximum rendered width.
+    Wraps on spaces first and falls back to per-character wrapping when a single word is too wide.
+    Args:
+        text: Text to wrap
+        max_width: Maximum line width
+        font: Path of the font file
+        fontsize: Font size
+
+    Returns:
+        tuple: (wrapped text, text height)
+    """
+    # Build the font object used for measuring
+    font = ImageFont.truetype(font, fontsize)
+
+    def get_text_size(inner_text):
+        inner_text = inner_text.strip()
+        left, top, right, bottom = font.getbbox(inner_text)
+        return right - left, bottom - top
+
+    width, height = get_text_size(text)
+    if width <= max_width:
+        return text, height
+
+    logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 文本: {text}")
+
+    # Pass 1: wrap on spaces.
+    processed = True
+    _wrapped_lines_ = []
+    _txt_ = ""
+    for word in text.split(" "):
+        _before = _txt_
+        _txt_ += f"{word} "
+        _width, _height = get_text_size(_txt_)
+        if _width <= max_width:
+            continue
+        if _txt_.strip() == word.strip():
+            # One word alone is too wide, so word wrapping cannot succeed.
+            processed = False
+            break
+        _wrapped_lines_.append(_before)
+        _txt_ = f"{word} "
+    _wrapped_lines_.append(_txt_)
+    if processed:
+        _wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
+        result = "\n".join(_wrapped_lines_).strip()
+        height = len(_wrapped_lines_) * height
+        return result, height
+
+    # Pass 2: wrap character by character.
+    _wrapped_lines_ = []
+    _txt_ = ""
+    for char in text:
+        _width, _height = get_text_size(_txt_ + char)
+        if _width > max_width and _txt_:
+            # Close the line *before* adding the character that would overflow it.
+            # (Previously the line was closed only after it already exceeded max_width.)
+            _wrapped_lines_.append(_txt_)
+            _txt_ = ""
+        _txt_ += char
+    # The remainder is never empty here: the last character is always appended above,
+    # so no empty trailing line inflates the height.
+    _wrapped_lines_.append(_txt_)
+    result = "\n".join(_wrapped_lines_).strip()
+    height = len(_wrapped_lines_) * height
+    logger.debug(f"换行文本: {result}")
+    return result, height
+
+
+@contextmanager
+def manage_clip(clip):
+    """
+    Context manager that yields a clip and always closes it afterwards.
+    Args:
+        clip: Clip object to manage; its close() method is called on exit
+
+    Yields:
+        The clip object that was passed in
+    """
+    try:
+        yield clip
+    finally:
+        # Runs whether the with-body finished normally or raised.
+        clip.close()
+
+
+def resize_video_with_padding(clip, target_width: int, target_height: int):
+    """
+    Scale a clip to fit the target size and pad the remainder with black bars.
+    Args:
+        clip: Video clip (uses the MoviePy 2.x names resized / with_duration / with_position)
+        target_width: Target width
+        target_height: Target height
+
+    Returns:
+        The resized clip if the aspect ratios match, else a CompositeVideoClip on a black background
+    """
+    clip_ratio = clip.w / clip.h
+    target_ratio = target_width / target_height
+
+    if clip_ratio == target_ratio:
+        return clip.resized((target_width, target_height))
+
+    # Scale by the limiting dimension so the scaled clip fits inside the target frame.
+    if clip_ratio > target_ratio:
+        scale_factor = target_width / clip.w
+    else:
+        scale_factor = target_height / clip.h
+
+    new_width = int(clip.w * scale_factor)
+    new_height = int(clip.h * scale_factor)
+    clip_resized = clip.resized(new_size=(new_width, new_height))
+    background = ColorClip(
+        size=(target_width, target_height),
+        color=(0, 0, 0)
+    ).with_duration(clip.duration)
+
+    return CompositeVideoClip([
+        background,
+        clip_resized.with_position("center")
+    ])
+
+
+def loop_audio_clip(audio_clip: AudioFileClip, target_duration: float) -> AudioFileClip:
+    """
+    Repeat an audio clip back to back until it covers the target duration.
+
+    Args:
+        audio_clip: Source audio clip (uses the MoviePy 2.x names with_start / subclipped)
+        target_duration: Target duration in seconds
+    Returns:
+        The looped audio, trimmed to target_duration
+    """
+    # One repetition more than the integer quotient guarantees full coverage.
+    loops_needed = int(target_duration / audio_clip.duration) + 1
+
+    # Chain the repetitions; each copy starts where the audio built so far ends.
+    extended_audio = audio_clip
+    for _ in range(loops_needed - 1):
+        extended_audio = CompositeAudioClip([
+            extended_audio,
+            audio_clip.with_start(extended_audio.duration)
+        ])
+
+    # Trim to the target duration (MoviePy 2.x: subclipped, formerly subclip).
+    return extended_audio.subclipped(0, target_duration)
+
+
+def calculate_subtitle_position(position, video_height: int, text_height: int = 0) -> tuple:
+    """
+    Work out where a subtitle is placed inside the video frame.
+
+    Args:
+        position: A SubtitlePosition enum value, or a number giving the offset from the top as a fraction of the height
+        video_height: Height of the video
+        text_height: Height of the subtitle text
+
+    Returns:
+        tuple: (x, y) where x is always 'center' and y is the vertical offset
+    """
+    edge_margin = 50  # gap kept between the subtitle and the frame edge
+
+    # A bare number is a fraction of the video height, measured from the top.
+    if isinstance(position, (int, float)):
+        return ('center', int(video_height * position))
+
+    # Preset anchors
+    if position == SubtitlePosition.TOP:
+        y_offset = edge_margin
+    elif position == SubtitlePosition.CENTER:
+        y_offset = video_height // 2
+    else:
+        # BOTTOM, and any other value, sits above the bottom margin.
+        y_offset = video_height - edge_margin - text_height
+
+    return ('center', y_offset)
+
+
+def generate_video_v3(
+        video_path: str,
+        subtitle_style: dict,
+        volume_config: dict,
+        subtitle_path: Optional[str] = None,
+        bgm_path: Optional[str] = None,
+        narration_path: Optional[str] = None,
+        output_path: str = "output.mp4",
+        font_path: Optional[str] = None
+) -> None:
+    """
+    Merge a video with optional subtitles, background music and narration.
+
+    Subtitles are drawn from the SRT file, the audio tracks are mixed, and
+    the result is encoded with libx264/aac at the source frame rate.
+
+    Args:
+        video_path: Path of the source video file.
+        subtitle_style: Subtitle styling. The keys read here are:
+            - fontsize: font size passed to TextClip
+            - color: font colour passed to TextClip
+            - position: a SubtitlePosition member, or a number read as the
+              fraction of the video height measured from the top
+            Other keys are not read by this function.
+        volume_config: Volume multipliers; a missing key uses its default:
+            - original: volume of the video's own audio, default 1.0
+            - bgm: background music volume, default 0.3
+            - narration: narration volume, default 1.0
+        subtitle_path: Path of an SRT subtitle file (optional).
+        bgm_path: Path of a background music file (optional).
+        narration_path: Path of a narration audio file (optional).
+        output_path: Path the rendered video is written to.
+        font_path: Path of a font file (.ttf/.otf, ...) used for subtitles.
+
+    Returns:
+        None. The rendered video is written to ``output_path``.
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+    """
+    # The source video must exist before anything is opened.
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"视频文件不存在: {video_path}")
+
+    # Load the video.
+    video = VideoFileClip(video_path)
+    subtitle_clips = []
+    # Clips that own an open file reader; all are released in ``finally``.
+    opened_clips = [video]
+
+    try:
+        # Build subtitle clips (when a subtitle file is given).
+        if subtitle_path:
+            if os.path.exists(subtitle_path):
+                # A missing font file is only reported; rendering still runs.
+                if font_path and not os.path.exists(font_path):
+                    logger.warning(f"警告：字体文件不存在: {font_path}")
+
+                try:
+                    subs = pysrt.open(subtitle_path)
+                    logger.info(f"读取到 {len(subs)} 条字幕")
+
+                    for index, sub in enumerate(subs):
+                        # ordinal / 1000 gives the times used as clip seconds.
+                        start_time = sub.start.ordinal / 1000
+                        end_time = sub.end.ordinal / 1000
+
+                        try:
+                            # Skip entries without text.
+                            if not sub.text or sub.text.strip() == '':
+                                logger.info(f"警告：第 {index + 1} 条字幕内容为空，已跳过")
+                                continue
+
+                            # Make sure the text is one string, joining list items.
+                            if isinstance(sub.text, (list, tuple)):
+                                subtitle_text = ' '.join(str(item) for item in sub.text if item is not None)
+                            else:
+                                subtitle_text = str(sub.text)
+
+                            subtitle_text = subtitle_text.strip()
+
+                            if not subtitle_text:
+                                logger.info(f"警告：第 {index + 1} 条字幕处理后为空，已跳过")
+                                continue
+
+                            # Render the text once; the same clip supplies the
+                            # height for positioning and becomes the subtitle.
+                            text_clip = TextClip(
+                                subtitle_text,
+                                font=font_path,
+                                fontsize=subtitle_style['fontsize'],
+                                color=subtitle_style['color']
+                            )
+
+                            position = calculate_subtitle_position(
+                                subtitle_style['position'],
+                                video.h,
+                                text_clip.h
+                            )
+                            subtitle_clips.append(
+                                text_clip
+                                .set_position(position)
+                                .set_duration(end_time - start_time)
+                                .set_start(start_time)
+                            )
+
+                        except Exception:
+                            logger.error(f"警告：创建第 {index + 1} 条字幕时出错: {traceback.format_exc()}")
+
+                    logger.info(f"成功创建 {len(subtitle_clips)} 条字幕剪辑")
+                except Exception as e:
+                    logger.info(f"警告：处理字幕文件时出错: {str(e)}")
+            else:
+                logger.info(f"提示：字幕文件不存在: {subtitle_path}")
+
+        # Collect the audio tracks to mix.
+        audio_clips = []
+
+        # Original audio; missing volume keys fall back to the documented defaults.
+        logger.debug(f"音量配置: {volume_config}")
+        if video.audio is not None:
+            original_audio = video.audio.volumex(volume_config.get('original', 1.0))
+            audio_clips.append(original_audio)
+
+        # Background music, looped or trimmed to the video length.
+        if bgm_path:
+            bgm_source = AudioFileClip(bgm_path)
+            opened_clips.append(bgm_source)
+            if bgm_source.duration < video.duration:
+                bgm = loop_audio_clip(bgm_source, video.duration)
+            else:
+                bgm = bgm_source.subclip(0, video.duration)
+            audio_clips.append(bgm.volumex(volume_config.get('bgm', 0.3)))
+
+        # Narration audio.
+        if narration_path:
+            narration_source = AudioFileClip(narration_path)
+            opened_clips.append(narration_source)
+            narration = narration_source.volumex(volume_config.get('narration', 1.0))
+            audio_clips.append(narration)
+
+        # Overlay the subtitles on the video.
+        if subtitle_clips:
+            final_video = CompositeVideoClip([video] + subtitle_clips, size=video.size)
+        else:
+            logger.info("警告：没有字幕被添加到视频中")
+            final_video = video
+
+        # Mix every collected audio track into the final video.
+        if audio_clips:
+            final_audio = CompositeAudioClip(audio_clips)
+            final_video = final_video.set_audio(final_audio)
+
+        # Export the result.
+        logger.info("开始导出视频...")
+        final_video.write_videofile(
+            output_path,
+            codec='libx264',
+            audio_codec='aac',
+            fps=video.fps
+        )
+        logger.info(f"视频已导出到: {output_path}")
+    finally:
+        # Runs on success and on failure, so readers are never left open.
+        for clip in opened_clips + subtitle_clips:
+            clip.close()
diff --git a/app/services/video_service.py b/app/services/video_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b2ddf067bf8833d5984d873dd8f332ad6b73a06
--- /dev/null
+++ b/app/services/video_service.py
@@ -0,0 +1,56 @@
+import os
+from uuid import uuid4
+from loguru import logger
+from typing import Dict, List, Optional, Tuple
+
+from app.services import material
+
+
+class VideoService:
+    @staticmethod
+    async def crop_video(
+        video_path: str,
+        video_script: List[dict]
+    ) -> Tuple[str, Dict[str, str]]:
+        """
+        Cut the source video into one sub-clip per script scene.
+
+        Each scene's ``timestamp`` is handed to ``material.clip_videos``;
+        scenes whose timestamp comes back in the result get a ``path`` key
+        pointing at their clip (``video_script`` is updated in place).
+
+        Args:
+            video_path: Path of the source video file.
+            video_script: Scene dicts, each carrying a ``timestamp`` key.
+
+        Returns:
+            Tuple[str, Dict[str, str]]: ``(task_id, clips)`` where ``clips``
+            maps each timestamp to the path of its cut clip.
+
+        Raises:
+            ValueError: If ``material.clip_videos`` returns ``None``.
+        """
+        try:
+            task_id = str(uuid4())
+            timestamps = [item['timestamp'] for item in video_script]
+            clips = material.clip_videos(
+                task_id=task_id,
+                timestamp_terms=timestamps,
+                origin_video=video_path
+            )
+            if clips is None:
+                raise ValueError("裁剪视频失败")
+
+            # A scene missing from the result is logged, not fatal.
+            for item in video_script:
+                try:
+                    item['path'] = clips[item['timestamp']]
+                except KeyError as missing:
+                    logger.error(f"更新视频路径失败: {missing}")
+
+            logger.debug(f"裁剪视频成功，共生成 {len(timestamps)} 个视频片段")
+            logger.debug(f"视频片段路径: {clips}")
+            return task_id, clips
+        except Exception:
+            logger.exception("裁剪视频失败")
+            raise
\ No newline at end of file
diff --git a/app/services/voice.py b/app/services/voice.py
new file mode 100644
index 0000000000000000000000000000000000000000..31f6d668e8e190aa5ca787202e091f907a5ac6dc
--- /dev/null
+++ b/app/services/voice.py
@@ -0,0 +1,1469 @@
+import os
+import re
+import json
+import traceback
+import edge_tts
+import asyncio
+from loguru import logger
+from typing import List, Union
+from datetime import datetime
+from xml.sax.saxutils import unescape
+from edge_tts import submaker, SubMaker
+from edge_tts.submaker import mktimestamp
+from moviepy.video.tools import subtitles
+import time
+
+from app.config import config
+from app.utils import utils
+
+
+def get_all_azure_voices(filter_locals=None) -> list[str]:
+    """
+    List the built-in voices as sorted "<name>-<gender>" strings.
+
+    Args:
+        filter_locals: Locale prefixes to keep, matched case-insensitively
+            against the start of the voice name. ``None`` selects the
+            default locales; an empty list keeps every voice.
+    """
+    if filter_locals is None:
+        filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"]
+    voices_str = """
+Name: af-ZA-AdriNeural
+Gender: Female
+
+Name: af-ZA-WillemNeural
+Gender: Male
+
+Name: am-ET-AmehaNeural
+Gender: Male
+
+Name: am-ET-MekdesNeural
+Gender: Female
+
+Name: ar-AE-FatimaNeural
+Gender: Female
+
+Name: ar-AE-HamdanNeural
+Gender: Male
+
+Name: ar-BH-AliNeural
+Gender: Male
+
+Name: ar-BH-LailaNeural
+Gender: Female
+
+Name: ar-DZ-AminaNeural
+Gender: Female
+
+Name: ar-DZ-IsmaelNeural
+Gender: Male
+
+Name: ar-EG-SalmaNeural
+Gender: Female
+
+Name: ar-EG-ShakirNeural
+Gender: Male
+
+Name: ar-IQ-BasselNeural
+Gender: Male
+
+Name: ar-IQ-RanaNeural
+Gender: Female
+
+Name: ar-JO-SanaNeural
+Gender: Female
+
+Name: ar-JO-TaimNeural
+Gender: Male
+
+Name: ar-KW-FahedNeural
+Gender: Male
+
+Name: ar-KW-NouraNeural
+Gender: Female
+
+Name: ar-LB-LaylaNeural
+Gender: Female
+
+Name: ar-LB-RamiNeural
+Gender: Male
+
+Name: ar-LY-ImanNeural
+Gender: Female
+
+Name: ar-LY-OmarNeural
+Gender: Male
+
+Name: ar-MA-JamalNeural
+Gender: Male
+
+Name: ar-MA-MounaNeural
+Gender: Female
+
+Name: ar-OM-AbdullahNeural
+Gender: Male
+
+Name: ar-OM-AyshaNeural
+Gender: Female
+
+Name: ar-QA-AmalNeural
+Gender: Female
+
+Name: ar-QA-MoazNeural
+Gender: Male
+
+Name: ar-SA-HamedNeural
+Gender: Male
+
+Name: ar-SA-ZariyahNeural
+Gender: Female
+
+Name: ar-SY-AmanyNeural
+Gender: Female
+
+Name: ar-SY-LaithNeural
+Gender: Male
+
+Name: ar-TN-HediNeural
+Gender: Male
+
+Name: ar-TN-ReemNeural
+Gender: Female
+
+Name: ar-YE-MaryamNeural
+Gender: Female
+
+Name: ar-YE-SalehNeural
+Gender: Male
+
+Name: az-AZ-BabekNeural
+Gender: Male
+
+Name: az-AZ-BanuNeural
+Gender: Female
+
+Name: bg-BG-BorislavNeural
+Gender: Male
+
+Name: bg-BG-KalinaNeural
+Gender: Female
+
+Name: bn-BD-NabanitaNeural
+Gender: Female
+
+Name: bn-BD-PradeepNeural
+Gender: Male
+
+Name: bn-IN-BashkarNeural
+Gender: Male
+
+Name: bn-IN-TanishaaNeural
+Gender: Female
+
+Name: bs-BA-GoranNeural
+Gender: Male
+
+Name: bs-BA-VesnaNeural
+Gender: Female
+
+Name: ca-ES-EnricNeural
+Gender: Male
+
+Name: ca-ES-JoanaNeural
+Gender: Female
+
+Name: cs-CZ-AntoninNeural
+Gender: Male
+
+Name: cs-CZ-VlastaNeural
+Gender: Female
+
+Name: cy-GB-AledNeural
+Gender: Male
+
+Name: cy-GB-NiaNeural
+Gender: Female
+
+Name: da-DK-ChristelNeural
+Gender: Female
+
+Name: da-DK-JeppeNeural
+Gender: Male
+
+Name: de-AT-IngridNeural
+Gender: Female
+
+Name: de-AT-JonasNeural
+Gender: Male
+
+Name: de-CH-JanNeural
+Gender: Male
+
+Name: de-CH-LeniNeural
+Gender: Female
+
+Name: de-DE-AmalaNeural
+Gender: Female
+
+Name: de-DE-ConradNeural
+Gender: Male
+
+Name: de-DE-FlorianMultilingualNeural
+Gender: Male
+
+Name: de-DE-KatjaNeural
+Gender: Female
+
+Name: de-DE-KillianNeural
+Gender: Male
+
+Name: de-DE-SeraphinaMultilingualNeural
+Gender: Female
+
+Name: el-GR-AthinaNeural
+Gender: Female
+
+Name: el-GR-NestorasNeural
+Gender: Male
+
+Name: en-AU-NatashaNeural
+Gender: Female
+
+Name: en-AU-WilliamNeural
+Gender: Male
+
+Name: en-CA-ClaraNeural
+Gender: Female
+
+Name: en-CA-LiamNeural
+Gender: Male
+
+Name: en-GB-LibbyNeural
+Gender: Female
+
+Name: en-GB-MaisieNeural
+Gender: Female
+
+Name: en-GB-RyanNeural
+Gender: Male
+
+Name: en-GB-SoniaNeural
+Gender: Female
+
+Name: en-GB-ThomasNeural
+Gender: Male
+
+Name: en-HK-SamNeural
+Gender: Male
+
+Name: en-HK-YanNeural
+Gender: Female
+
+Name: en-IE-ConnorNeural
+Gender: Male
+
+Name: en-IE-EmilyNeural
+Gender: Female
+
+Name: en-IN-NeerjaExpressiveNeural
+Gender: Female
+
+Name: en-IN-NeerjaNeural
+Gender: Female
+
+Name: en-IN-PrabhatNeural
+Gender: Male
+
+Name: en-KE-AsiliaNeural
+Gender: Female
+
+Name: en-KE-ChilembaNeural
+Gender: Male
+
+Name: en-NG-AbeoNeural
+Gender: Male
+
+Name: en-NG-EzinneNeural
+Gender: Female
+
+Name: en-NZ-MitchellNeural
+Gender: Male
+
+Name: en-NZ-MollyNeural
+Gender: Female
+
+Name: en-PH-JamesNeural
+Gender: Male
+
+Name: en-PH-RosaNeural
+Gender: Female
+
+Name: en-SG-LunaNeural
+Gender: Female
+
+Name: en-SG-WayneNeural
+Gender: Male
+
+Name: en-TZ-ElimuNeural
+Gender: Male
+
+Name: en-TZ-ImaniNeural
+Gender: Female
+
+Name: en-US-AnaNeural
+Gender: Female
+
+Name: en-US-AndrewNeural
+Gender: Male
+
+Name: en-US-AriaNeural
+Gender: Female
+
+Name: en-US-AvaNeural
+Gender: Female
+
+Name: en-US-BrianNeural
+Gender: Male
+
+Name: en-US-ChristopherNeural
+Gender: Male
+
+Name: en-US-EmmaNeural
+Gender: Female
+
+Name: en-US-EricNeural
+Gender: Male
+
+Name: en-US-GuyNeural
+Gender: Male
+
+Name: en-US-JennyNeural
+Gender: Female
+
+Name: en-US-MichelleNeural
+Gender: Female
+
+Name: en-US-RogerNeural
+Gender: Male
+
+Name: en-US-SteffanNeural
+Gender: Male
+
+Name: en-ZA-LeahNeural
+Gender: Female
+
+Name: en-ZA-LukeNeural
+Gender: Male
+
+Name: es-AR-ElenaNeural
+Gender: Female
+
+Name: es-AR-TomasNeural
+Gender: Male
+
+Name: es-BO-MarceloNeural
+Gender: Male
+
+Name: es-BO-SofiaNeural
+Gender: Female
+
+Name: es-CL-CatalinaNeural
+Gender: Female
+
+Name: es-CL-LorenzoNeural
+Gender: Male
+
+Name: es-CO-GonzaloNeural
+Gender: Male
+
+Name: es-CO-SalomeNeural
+Gender: Female
+
+Name: es-CR-JuanNeural
+Gender: Male
+
+Name: es-CR-MariaNeural
+Gender: Female
+
+Name: es-CU-BelkysNeural
+Gender: Female
+
+Name: es-CU-ManuelNeural
+Gender: Male
+
+Name: es-DO-EmilioNeural
+Gender: Male
+
+Name: es-DO-RamonaNeural
+Gender: Female
+
+Name: es-EC-AndreaNeural
+Gender: Female
+
+Name: es-EC-LuisNeural
+Gender: Male
+
+Name: es-ES-AlvaroNeural
+Gender: Male
+
+Name: es-ES-ElviraNeural
+Gender: Female
+
+Name: es-ES-XimenaNeural
+Gender: Female
+
+Name: es-GQ-JavierNeural
+Gender: Male
+
+Name: es-GQ-TeresaNeural
+Gender: Female
+
+Name: es-GT-AndresNeural
+Gender: Male
+
+Name: es-GT-MartaNeural
+Gender: Female
+
+Name: es-HN-CarlosNeural
+Gender: Male
+
+Name: es-HN-KarlaNeural
+Gender: Female
+
+Name: es-MX-DaliaNeural
+Gender: Female
+
+Name: es-MX-JorgeNeural
+Gender: Male
+
+Name: es-NI-FedericoNeural
+Gender: Male
+
+Name: es-NI-YolandaNeural
+Gender: Female
+
+Name: es-PA-MargaritaNeural
+Gender: Female
+
+Name: es-PA-RobertoNeural
+Gender: Male
+
+Name: es-PE-AlexNeural
+Gender: Male
+
+Name: es-PE-CamilaNeural
+Gender: Female
+
+Name: es-PR-KarinaNeural
+Gender: Female
+
+Name: es-PR-VictorNeural
+Gender: Male
+
+Name: es-PY-MarioNeural
+Gender: Male
+
+Name: es-PY-TaniaNeural
+Gender: Female
+
+Name: es-SV-LorenaNeural
+Gender: Female
+
+Name: es-SV-RodrigoNeural
+Gender: Male
+
+Name: es-US-AlonsoNeural
+Gender: Male
+
+Name: es-US-PalomaNeural
+Gender: Female
+
+Name: es-UY-MateoNeural
+Gender: Male
+
+Name: es-UY-ValentinaNeural
+Gender: Female
+
+Name: es-VE-PaolaNeural
+Gender: Female
+
+Name: es-VE-SebastianNeural
+Gender: Male
+
+Name: et-EE-AnuNeural
+Gender: Female
+
+Name: et-EE-KertNeural
+Gender: Male
+
+Name: fa-IR-DilaraNeural
+Gender: Female
+
+Name: fa-IR-FaridNeural
+Gender: Male
+
+Name: fi-FI-HarriNeural
+Gender: Male
+
+Name: fi-FI-NooraNeural
+Gender: Female
+
+Name: fil-PH-AngeloNeural
+Gender: Male
+
+Name: fil-PH-BlessicaNeural
+Gender: Female
+
+Name: fr-BE-CharlineNeural
+Gender: Female
+
+Name: fr-BE-GerardNeural
+Gender: Male
+
+Name: fr-CA-AntoineNeural
+Gender: Male
+
+Name: fr-CA-JeanNeural
+Gender: Male
+
+Name: fr-CA-SylvieNeural
+Gender: Female
+
+Name: fr-CA-ThierryNeural
+Gender: Male
+
+Name: fr-CH-ArianeNeural
+Gender: Female
+
+Name: fr-CH-FabriceNeural
+Gender: Male
+
+Name: fr-FR-DeniseNeural
+Gender: Female
+
+Name: fr-FR-EloiseNeural
+Gender: Female
+
+Name: fr-FR-HenriNeural
+Gender: Male
+
+Name: fr-FR-RemyMultilingualNeural
+Gender: Male
+
+Name: fr-FR-VivienneMultilingualNeural
+Gender: Female
+
+Name: ga-IE-ColmNeural
+Gender: Male
+
+Name: ga-IE-OrlaNeural
+Gender: Female
+
+Name: gl-ES-RoiNeural
+Gender: Male
+
+Name: gl-ES-SabelaNeural
+Gender: Female
+
+Name: gu-IN-DhwaniNeural
+Gender: Female
+
+Name: gu-IN-NiranjanNeural
+Gender: Male
+
+Name: he-IL-AvriNeural
+Gender: Male
+
+Name: he-IL-HilaNeural
+Gender: Female
+
+Name: hi-IN-MadhurNeural
+Gender: Male
+
+Name: hi-IN-SwaraNeural
+Gender: Female
+
+Name: hr-HR-GabrijelaNeural
+Gender: Female
+
+Name: hr-HR-SreckoNeural
+Gender: Male
+
+Name: hu-HU-NoemiNeural
+Gender: Female
+
+Name: hu-HU-TamasNeural
+Gender: Male
+
+Name: id-ID-ArdiNeural
+Gender: Male
+
+Name: id-ID-GadisNeural
+Gender: Female
+
+Name: is-IS-GudrunNeural
+Gender: Female
+
+Name: is-IS-GunnarNeural
+Gender: Male
+
+Name: it-IT-DiegoNeural
+Gender: Male
+
+Name: it-IT-ElsaNeural
+Gender: Female
+
+Name: it-IT-GiuseppeNeural
+Gender: Male
+
+Name: it-IT-IsabellaNeural
+Gender: Female
+
+Name: ja-JP-KeitaNeural
+Gender: Male
+
+Name: ja-JP-NanamiNeural
+Gender: Female
+
+Name: jv-ID-DimasNeural
+Gender: Male
+
+Name: jv-ID-SitiNeural
+Gender: Female
+
+Name: ka-GE-EkaNeural
+Gender: Female
+
+Name: ka-GE-GiorgiNeural
+Gender: Male
+
+Name: kk-KZ-AigulNeural
+Gender: Female
+
+Name: kk-KZ-DauletNeural
+Gender: Male
+
+Name: km-KH-PisethNeural
+Gender: Male
+
+Name: km-KH-SreymomNeural
+Gender: Female
+
+Name: kn-IN-GaganNeural
+Gender: Male
+
+Name: kn-IN-SapnaNeural
+Gender: Female
+
+Name: ko-KR-HyunsuNeural
+Gender: Male
+
+Name: ko-KR-InJoonNeural
+Gender: Male
+
+Name: ko-KR-SunHiNeural
+Gender: Female
+
+Name: lo-LA-ChanthavongNeural
+Gender: Male
+
+Name: lo-LA-KeomanyNeural
+Gender: Female
+
+Name: lt-LT-LeonasNeural
+Gender: Male
+
+Name: lt-LT-OnaNeural
+Gender: Female
+
+Name: lv-LV-EveritaNeural
+Gender: Female
+
+Name: lv-LV-NilsNeural
+Gender: Male
+
+Name: mk-MK-AleksandarNeural
+Gender: Male
+
+Name: mk-MK-MarijaNeural
+Gender: Female
+
+Name: ml-IN-MidhunNeural
+Gender: Male
+
+Name: ml-IN-SobhanaNeural
+Gender: Female
+
+Name: mn-MN-BataaNeural
+Gender: Male
+
+Name: mn-MN-YesuiNeural
+Gender: Female
+
+Name: mr-IN-AarohiNeural
+Gender: Female
+
+Name: mr-IN-ManoharNeural
+Gender: Male
+
+Name: ms-MY-OsmanNeural
+Gender: Male
+
+Name: ms-MY-YasminNeural
+Gender: Female
+
+Name: mt-MT-GraceNeural
+Gender: Female
+
+Name: mt-MT-JosephNeural
+Gender: Male
+
+Name: my-MM-NilarNeural
+Gender: Female
+
+Name: my-MM-ThihaNeural
+Gender: Male
+
+Name: nb-NO-FinnNeural
+Gender: Male
+
+Name: nb-NO-PernilleNeural
+Gender: Female
+
+Name: ne-NP-HemkalaNeural
+Gender: Female
+
+Name: ne-NP-SagarNeural
+Gender: Male
+
+Name: nl-BE-ArnaudNeural
+Gender: Male
+
+Name: nl-BE-DenaNeural
+Gender: Female
+
+Name: nl-NL-ColetteNeural
+Gender: Female
+
+Name: nl-NL-FennaNeural
+Gender: Female
+
+Name: nl-NL-MaartenNeural
+Gender: Male
+
+Name: pl-PL-MarekNeural
+Gender: Male
+
+Name: pl-PL-ZofiaNeural
+Gender: Female
+
+Name: ps-AF-GulNawazNeural
+Gender: Male
+
+Name: ps-AF-LatifaNeural
+Gender: Female
+
+Name: pt-BR-AntonioNeural
+Gender: Male
+
+Name: pt-BR-FranciscaNeural
+Gender: Female
+
+Name: pt-BR-ThalitaNeural
+Gender: Female
+
+Name: pt-PT-DuarteNeural
+Gender: Male
+
+Name: pt-PT-RaquelNeural
+Gender: Female
+
+Name: ro-RO-AlinaNeural
+Gender: Female
+
+Name: ro-RO-EmilNeural
+Gender: Male
+
+Name: ru-RU-DmitryNeural
+Gender: Male
+
+Name: ru-RU-SvetlanaNeural
+Gender: Female
+
+Name: si-LK-SameeraNeural
+Gender: Male
+
+Name: si-LK-ThiliniNeural
+Gender: Female
+
+Name: sk-SK-LukasNeural
+Gender: Male
+
+Name: sk-SK-ViktoriaNeural
+Gender: Female
+
+Name: sl-SI-PetraNeural
+Gender: Female
+
+Name: sl-SI-RokNeural
+Gender: Male
+
+Name: so-SO-MuuseNeural
+Gender: Male
+
+Name: so-SO-UbaxNeural
+Gender: Female
+
+Name: sq-AL-AnilaNeural
+Gender: Female
+
+Name: sq-AL-IlirNeural
+Gender: Male
+
+Name: sr-RS-NicholasNeural
+Gender: Male
+
+Name: sr-RS-SophieNeural
+Gender: Female
+
+Name: su-ID-JajangNeural
+Gender: Male
+
+Name: su-ID-TutiNeural
+Gender: Female
+
+Name: sv-SE-MattiasNeural
+Gender: Male
+
+Name: sv-SE-SofieNeural
+Gender: Female
+
+Name: sw-KE-RafikiNeural
+Gender: Male
+
+Name: sw-KE-ZuriNeural
+Gender: Female
+
+Name: sw-TZ-DaudiNeural
+Gender: Male
+
+Name: sw-TZ-RehemaNeural
+Gender: Female
+
+Name: ta-IN-PallaviNeural
+Gender: Female
+
+Name: ta-IN-ValluvarNeural
+Gender: Male
+
+Name: ta-LK-KumarNeural
+Gender: Male
+
+Name: ta-LK-SaranyaNeural
+Gender: Female
+
+Name: ta-MY-KaniNeural
+Gender: Female
+
+Name: ta-MY-SuryaNeural
+Gender: Male
+
+Name: ta-SG-AnbuNeural
+Gender: Male
+
+Name: ta-SG-VenbaNeural
+Gender: Female
+
+Name: te-IN-MohanNeural
+Gender: Male
+
+Name: te-IN-ShrutiNeural
+Gender: Female
+
+Name: th-TH-NiwatNeural
+Gender: Male
+
+Name: th-TH-PremwadeeNeural
+Gender: Female
+
+Name: tr-TR-AhmetNeural
+Gender: Male
+
+Name: tr-TR-EmelNeural
+Gender: Female
+
+Name: uk-UA-OstapNeural
+Gender: Male
+
+Name: uk-UA-PolinaNeural
+Gender: Female
+
+Name: ur-IN-GulNeural
+Gender: Female
+
+Name: ur-IN-SalmanNeural
+Gender: Male
+
+Name: ur-PK-AsadNeural
+Gender: Male
+
+Name: ur-PK-UzmaNeural
+Gender: Female
+
+Name: uz-UZ-MadinaNeural
+Gender: Female
+
+Name: uz-UZ-SardorNeural
+Gender: Male
+
+Name: vi-VN-HoaiMyNeural
+Gender: Female
+
+Name: vi-VN-NamMinhNeural
+Gender: Male
+
+Name: zh-CN-XiaoxiaoNeural
+Gender: Female
+
+Name: zh-CN-XiaoyiNeural
+Gender: Female
+
+Name: zh-CN-YunjianNeural
+Gender: Male
+
+Name: zh-CN-YunxiNeural
+Gender: Male
+
+Name: zh-CN-YunxiaNeural
+Gender: Male
+
+Name: zh-CN-YunyangNeural
+Gender: Male
+
+Name: zh-CN-liaoning-XiaobeiNeural
+Gender: Female
+
+Name: zh-CN-shaanxi-XiaoniNeural
+Gender: Female
+
+Name: zh-HK-HiuGaaiNeural
+Gender: Female
+
+Name: zh-HK-HiuMaanNeural
+Gender: Female
+
+Name: zh-HK-WanLungNeural
+Gender: Male
+
+Name: zh-TW-HsiaoChenNeural
+Gender: Female
+
+Name: zh-TW-HsiaoYuNeural
+Gender: Female
+
+Name: zh-TW-YunJheNeural
+Gender: Male
+
+Name: zu-ZA-ThandoNeural
+Gender: Female
+
+Name: zu-ZA-ThembaNeural
+Gender: Male
+
+
+Name: en-US-AvaMultilingualNeural-V2
+Gender: Female
+
+Name: en-US-AndrewMultilingualNeural-V2
+Gender: Male
+
+Name: en-US-EmmaMultilingualNeural-V2
+Gender: Female
+
+Name: en-US-BrianMultilingualNeural-V2
+Gender: Male
+
+Name: de-DE-FlorianMultilingualNeural-V2
+Gender: Male
+
+Name: de-DE-SeraphinaMultilingualNeural-V2
+Gender: Female
+
+Name: fr-FR-RemyMultilingualNeural-V2
+Gender: Male
+
+Name: fr-FR-VivienneMultilingualNeural-V2
+Gender: Female
+
+Name: zh-CN-XiaoxiaoMultilingualNeural-V2
+Gender: Female
+
+Name: zh-CN-YunxiNeural-V2
+Gender: Male
+    """.strip()
+    prefixes = tuple(prefix.lower() for prefix in filter_locals)
+    voices = []
+    name = ""
+    for line in voices_str.split("\n"):
+        line = line.strip()
+        if line.startswith("Name: "):
+            name = line[6:].strip()
+        elif line.startswith("Gender: "):
+            gender = line[8:].strip()
+            if name and gender:
+                # One tuple test lists a voice once even if several prefixes fit.
+                if not prefixes or name.lower().startswith(prefixes):
+                    voices.append(f"{name}-{gender}")
+                name = ""
+    voices.sort()
+    return voices
+
+
+def parse_voice_name(name: str):
+    """Strip the "-Female"/"-Male" gender marker from a voice label."""
+    # e.g. "zh-CN-XiaoxiaoMultilingualNeural-V2-Female" keeps its "-V2" part
+    for marker in ("-Female", "-Male"):
+        name = name.replace(marker, "")
+    return name.strip()
+
+
+def is_azure_v2_voice(voice_name: str):
+    """Return the voice name without "-V2" for a V2 voice, else ""."""
+    bare_name = parse_voice_name(voice_name)
+    is_v2 = bare_name.endswith("-V2")
+    return bare_name.replace("-V2", "").strip() if is_v2 else ""
+
+
+def tts(text: str, voice_name: str, voice_rate: float, voice_pitch: float,
+        voice_file: str) -> Union[SubMaker, None]:
+    """Run TTS with azure_tts_v2 for "-V2" voices, azure_tts_v1 otherwise."""
+    if not is_azure_v2_voice(voice_name):
+        return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
+    return azure_tts_v2(text, voice_name, voice_file)
+
+
+def convert_rate_to_percent(rate: float) -> str:
+    """Convert a speed multiplier (1.0 = unchanged) to a signed percent string.
+
+    The sign is always written, so a change that rounds to zero gives "+0%"
+    like every other result (edge_tts expects a leading sign on the rate).
+    """
+    percent = round((rate - 1.0) * 100)
+    return f"{percent:+d}%"
+
+
+def convert_pitch_to_percent(rate: float) -> str:
+    """Convert a pitch multiplier (1.0 = unchanged) to a signed "Hz" string.
+
+    The sign is always written, so a change that rounds to zero gives "+0Hz"
+    like every other result (edge_tts expects a leading sign on the pitch).
+    """
+    percent = round((rate - 1.0) * 100)
+    return f"{percent:+d}Hz"
+
+
+def azure_tts_v1(text: str, voice_name: str, voice_rate: float, voice_pitch: float,
+                 voice_file: str) -> Union[SubMaker, None]:
+    """
+    Synthesize speech with edge_tts and save the audio to ``voice_file``.
+
+    Up to three attempts are made, one second apart. The file is written
+    only once an attempt yields both audio data and word-boundary subtitles.
+
+    Args:
+        text: Text to speak; surrounding whitespace is stripped.
+        voice_name: Voice label; a "-Female"/"-Male" marker is removed.
+        voice_rate, voice_pitch: Multipliers, 1.0 meaning unchanged.
+        voice_file: Path the audio bytes are written to.
+
+    Returns: the populated SubMaker, or None when every attempt fails.
+    """
+    voice_name = parse_voice_name(voice_name)
+    text = text.strip()
+    rate_str = convert_rate_to_percent(voice_rate)
+    pitch_str = convert_pitch_to_percent(voice_pitch)
+    async def _do() -> tuple[SubMaker, bytes]:
+        communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
+        sub_maker = edge_tts.SubMaker()
+        audio_chunks = []  # joined once at the end instead of repeated bytes +=
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio":
+                audio_chunks.append(chunk["data"])
+            elif chunk["type"] == "WordBoundary":
+                sub_maker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
+        return sub_maker, b"".join(audio_chunks)
+    for i in range(3):
+        try:
+            logger.info(f"第 {i+1} 次使用 edge_tts 生成音频")
+            sub_maker, audio_data = asyncio.run(_do())
+            if sub_maker and sub_maker.subs and audio_data:
+                with open(voice_file, "wb") as file:
+                    file.write(audio_data)
+                return sub_maker
+            logger.warning("failed, invalid data generated")
+        except Exception as e:
+            logger.error(f"生成音频文件时出错: {str(e)}")
+        if i < 2:  # not the last attempt: pause before retrying
+            time.sleep(1)
+    return None
+
+
+def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
+    """
+    Synthesize speech with the Azure Speech SDK, writing audio to ``voice_file``.
+
+    Word-boundary events fill the returned SubMaker's ``subs`` and ``offset``
+    lists. Up to three attempts are made.
+
+    Args:
+        text: Text to speak; surrounding whitespace is stripped.
+        voice_name: Voice label whose name part ends in "-V2".
+        voice_file: Output path handed to the SDK's AudioOutputConfig.
+
+    Returns:
+        The populated SubMaker, or None when every attempt fails.
+
+    Raises:
+        ValueError: If ``voice_name`` is not a "-V2" voice.
+    """
+    v2_voice_name = is_azure_v2_voice(voice_name)
+    if not v2_voice_name:
+        # Report the name as passed in; the parsed value is empty here.
+        logger.error(f"invalid voice name: {voice_name}")
+        raise ValueError(f"invalid voice name: {voice_name}")
+    voice_name = v2_voice_name
+    text = text.strip()
+
+    def _format_duration_to_offset(duration) -> int:
+        # "H:MM:SS[.ffffff]" text -> 100-nanosecond ticks; ints pass through.
+        if isinstance(duration, str):
+            # str() of a timedelta (presumably what evt.duration is) has no
+            # fraction when microseconds are zero, so pick the matching format.
+            time_format = "%H:%M:%S.%f" if "." in duration else "%H:%M:%S"
+            time_obj = datetime.strptime(duration, time_format)
+            whole_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
+            milliseconds = whole_seconds * 1000 + time_obj.microsecond // 1000
+            return milliseconds * 10000
+        if isinstance(duration, int):
+            return duration
+        return 0
+
+    for i in range(3):
+        try:
+            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+            import azure.cognitiveservices.speech as speechsdk
+            sub_maker = SubMaker()
+
+            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
+                duration = _format_duration_to_offset(str(evt.duration))
+                offset = _format_duration_to_offset(evt.audio_offset)
+                sub_maker.subs.append(evt.text)
+                sub_maker.offset.append((offset, offset + duration))
+
+            # Creates an instance of a speech config with specified subscription key and service region.
+            speech_key = config.azure.get("speech_key", "")
+            service_region = config.azure.get("speech_region", "")
+            audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
+            speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
+            speech_config.speech_synthesis_voice_name = voice_name
+            speech_config.set_property(
+                property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
+                value="true",
+            )
+            speech_config.set_speech_synthesis_output_format(
+                speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3
+            )
+            speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config, speech_config=speech_config)
+            speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
+
+            result = speech_synthesizer.speak_text_async(text).get()
+            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+                logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
+                return sub_maker
+            elif result.reason == speechsdk.ResultReason.Canceled:
+                cancellation_details = result.cancellation_details
+                logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
+                if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                    logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
+            logger.warning(f"azure v2 speech synthesis did not complete, try: {i + 1}")
+            if i < 2:  # not the last attempt: wait 1 second before retrying
+                time.sleep(1)
+        except Exception as e:
+            logger.error(f"failed, error: {str(e)}")
+            if i < 2:  # not the last attempt: wait 3 seconds before retrying
+                time.sleep(3)
+    return None
+
+
+def _format_text(text: str) -> str:
+    """
+    Replace newlines, double quotes and brackets with spaces, then trim.
+
+    The brackets covered are [], (), {} and the full-width parentheses.
+
+    Returns:
+        The text with those characters blanked and outer whitespace removed.
+    """
+    unwanted = "\n\"[]()）（{}"
+    # One translation pass maps every unwanted character to a space.
+    blanked = text.translate(str.maketrans(unwanted, " " * len(unwanted)))
+    return blanked.strip()
+
+
+def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], list_script: List[dict], 
+                                  subtitle_file: str):
+    """
+    Build one subtitle file from several SubMaker objects, the full text and the original script.
+    1. Segment start/end times come from each script item's 'timestamp' ("start-end").
+    2. Script items whose 'OST' value is truthy are skipped.
+    3. The formatted text is split into sentences by punctuation.
+    4. Accumulated subs are emitted when they contain the next sentence or exceed 15 characters.
+    5. Entries are written to subtitle_file; nothing is returned and exceptions are only logged.
+    """
+    text = _format_text(text)
+    sentences = utils.split_string_by_punctuations(text)
+
+    def formatter(idx: int, start_time: str, end_time: str, sub_text: str) -> str:
+        return f"{idx}\n{start_time.replace('.', ',')} --> {end_time.replace('.', ',')}\n{sub_text}\n"
+
+    sub_items = []
+    sub_index = 0
+    sentence_index = 0
+
+    try:
+        sub_maker_index = 0
+        for script_item in list_script:
+            if script_item['OST']:  # NOTE(review): tts_multiple synthesises items with OST != 1 — confirm both filters agree
+                continue
+
+            start_time, end_time = script_item['timestamp'].split('-')
+            if sub_maker_index >= len(sub_maker_list):
+                logger.error(f"Sub maker list index out of range: {sub_maker_index}")
+                break
+            sub_maker = sub_maker_list[sub_maker_index]
+            sub_maker_index += 1
+
+            script_duration = utils.time_to_seconds(end_time) - utils.time_to_seconds(start_time)
+            audio_duration = get_audio_duration(sub_maker)
+            time_ratio = script_duration / audio_duration if audio_duration > 0 else 1  # scales audio offsets to the script slot
+
+            current_sub = ""
+            current_start = None
+            current_end = None
+
+            for offset, sub in zip(sub_maker.offset, sub_maker.subs):
+                sub = unescape(sub).strip()
+                sub_start = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[0] / 10000000 * time_ratio)
+                sub_end = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[1] / 10000000 * time_ratio)
+                
+                if current_start is None:
+                    current_start = sub_start
+                current_end = sub_end
+                
+                current_sub += sub
+                
+                # Emit every pending sentence that the accumulated text now contains
+                while sentence_index < len(sentences) and sentences[sentence_index] in current_sub:
+                    sub_index += 1
+                    line = formatter(
+                        idx=sub_index,
+                        start_time=current_start,
+                        end_time=current_end,
+                        sub_text=sentences[sentence_index].strip(),
+                    )
+                    sub_items.append(line)
+                    current_sub = current_sub.replace(sentences[sentence_index], "", 1).strip()
+                    current_start = current_end
+                    sentence_index += 1
+
+                # Also flush the accumulated text once it is longer than 15 characters
+                if len(current_sub) > 15:
+                    sub_index += 1
+                    line = formatter(
+                        idx=sub_index,
+                        start_time=current_start,
+                        end_time=current_end,
+                        sub_text=current_sub.strip(),
+                    )
+                    sub_items.append(line)
+                    current_sub = ""
+                    current_start = current_end
+
+            # Flush whatever text is left over for this segment
+            if current_sub.strip():
+                sub_index += 1
+                line = formatter(
+                    idx=sub_index,
+                    start_time=current_start,
+                    end_time=current_end,
+                    sub_text=current_sub.strip(),
+                )
+                sub_items.append(line)
+
+        if len(sub_items) == 0:
+            logger.error("No subtitle items generated")
+            return
+
+        with open(subtitle_file, "w", encoding="utf-8") as file:
+            file.write("\n".join(sub_items))
+
+        logger.info(f"completed, subtitle file created: {subtitle_file}")
+    except Exception as e:
+        logger.error(f"failed, error: {str(e)}")
+        traceback.print_exc()
+
+
+def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
+    """
+    Create a subtitle file whose entries follow the lines of the script text.
+    1. Split the formatted text into lines by punctuation.
+    2. Accumulate subs until they match the next script line, then emit one entry.
+    3. Write the file only when every line matched; return (subtitle_file, duration), else None.
+    """
+
+    text = _format_text(text)
+
+    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
+        """
+        1
+        00:00:00,000 --> 00:00:02,360
+        Running is a simple and easy form of exercise
+        """
+        start_t = mktimestamp(start_time).replace(".", ",")
+        end_t = mktimestamp(end_time).replace(".", ",")
+        return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n"
+
+    start_time = -1.0  # negative means "no entry in progress"
+    sub_items = []
+    sub_index = 0
+
+    script_lines = utils.split_string_by_punctuations(text)
+
+    def match_line(_sub_line: str, _sub_index: int):
+        if len(script_lines) <= _sub_index:
+            return ""
+
+        _line = script_lines[_sub_index]
+        if _sub_line == _line:  # exact match
+            return script_lines[_sub_index].strip()
+
+        _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line)  # match ignoring punctuation
+        _line_ = re.sub(r"[^\w\s]", "", _line)
+        if _sub_line_ == _line_:
+            return _line_.strip()
+
+        _sub_line_ = re.sub(r"\W+", "", _sub_line)  # match ignoring punctuation and whitespace
+        _line_ = re.sub(r"\W+", "", _line)
+        if _sub_line_ == _line_:
+            return _line.strip()
+
+        return ""
+
+    sub_line = ""
+
+    try:
+        for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
+            _start_time, end_time = offset
+            if start_time < 0:
+                start_time = _start_time
+
+            sub = unescape(sub)
+            sub_line += sub
+            sub_text = match_line(sub_line, sub_index)
+            if sub_text:
+                sub_index += 1
+                line = formatter(
+                    idx=sub_index,
+                    start_time=start_time,
+                    end_time=end_time,
+                    sub_text=sub_text,
+                )
+                sub_items.append(line)
+                start_time = -1.0
+                sub_line = ""
+
+        if len(sub_items) == len(script_lines):
+            with open(subtitle_file, "w", encoding="utf-8") as file:
+                file.write("\n".join(sub_items) + "\n")
+            try:
+                sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8")
+                duration = max([tb for ((ta, tb), txt) in sbs])  # largest end time parsed back from the file
+                logger.info(
+                    f"已创建字幕文件: {subtitle_file}, duration: {duration}"
+                )
+                return subtitle_file, duration
+            except Exception as e:
+                logger.error(f"failed, error: {str(e)}")
+                os.remove(subtitle_file)  # the file is dropped and the function falls through, returning None
+        else:
+            logger.error(
+                f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}"
+                f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
+                f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
+            )
+
+    except Exception as e:
+        logger.error(f"failed, error: {str(e)}")
+
+
+def get_audio_duration(sub_maker: submaker.SubMaker):
+    """
+    Return the audio duration: end of the last offset / 10,000,000, or 0.0 without offsets.
+    """
+    offsets = sub_maker.offset
+    last_end = offsets[-1][1] if offsets else 0.0
+    return last_end / 10000000
+
+
+def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float):
+    """
+    Run TTS for every script item whose 'OST' is not 1 and create a subtitle file for each.
+
+    :param task_id: task ID; output files are written to utils.task_dir(task_id)
+    :param list_script: script items (dicts read for '_id', 'timestamp', 'narration', 'OST')
+    :param voice_name: voice name (passed through parse_voice_name)
+    :param voice_rate: speech rate handed to tts
+    :param voice_pitch: voice pitch handed to tts
+    :return: list of result dicts; 'subtitle_file' is "" when no subtitle could be created
+    """
+    voice_name = parse_voice_name(voice_name)
+    output_dir = utils.task_dir(task_id)
+    tts_results = []
+
+    for item in list_script:
+        if item['OST'] != 1:
+            # Colons in the timestamp are replaced by underscores for use in file names
+            timestamp = item['timestamp'].replace(':', '_')
+            audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
+            subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt")
+            text = item['narration']
+
+            sub_maker = tts(text=text, voice_name=voice_name, voice_rate=voice_rate,
+                            voice_pitch=voice_pitch, voice_file=audio_file)
+            if sub_maker is None:
+                logger.error(f"无法为时间戳 {timestamp} 生成音频; "
+                             f"如果您在中国，请使用VPN; "
+                             f"或者使用其他 tts 引擎")
+                continue
+
+            # create_subtitle returns None on failure; unpacking that directly raised
+            # TypeError and lost every result collected so far.
+            subtitle_result = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
+            if subtitle_result is None:
+                logger.warning(f"subtitle creation failed for timestamp {timestamp}; keeping the audio")
+                subtitle_file, duration = "", get_audio_duration(sub_maker)
+            else:
+                _, duration = subtitle_result
+
+            tts_results.append({
+                "_id": item['_id'],
+                "timestamp": item['timestamp'],
+                "audio_file": audio_file,
+                "subtitle_file": subtitle_file,
+                "duration": duration,
+                "text": text,
+            })
+            logger.info(f"已生成音频文件: {audio_file}")
+
+    return tts_results
diff --git a/app/services/youtube_service.py b/app/services/youtube_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4a7a79cd98ea8356a02830df9cf965e0a7151ed
--- /dev/null
+++ b/app/services/youtube_service.py
@@ -0,0 +1,146 @@
+import yt_dlp
+import os
+from typing import List, Dict, Optional, Tuple
+from loguru import logger
+from uuid import uuid4
+
+from app.utils import utils
+from app.services import video as VideoService
+
+
+class YoutubeService:
+    """Download YouTube videos at a requested resolution through yt-dlp."""
+
+    def __init__(self):
+        self.supported_formats = ['mp4', 'mkv', 'webm', 'flv', 'avi']
+
+    def _get_video_formats(self, url: str) -> List[Dict]:
+        """Return the formats available for ``url``; failures are logged and re-raised."""
+        ydl_opts = {
+            'quiet': True,
+            'no_warnings': True
+        }
+
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+                formats = info.get('formats', [])
+
+                format_list = []
+                for f in formats:
+                    format_info = {
+                        'format_id': f.get('format_id', 'N/A'),
+                        'ext': f.get('ext', 'N/A'),
+                        # A key that is present with a None value must also map to 'N/A'
+                        'resolution': f.get('format_note') or 'N/A',
+                        'filesize': f.get('filesize', 'N/A'),
+                        'vcodec': f.get('vcodec', 'N/A'),
+                        'acodec': f.get('acodec', 'N/A')
+                    }
+                    format_list.append(format_info)
+
+                return format_list
+        except Exception as e:
+            logger.error(f"获取视频格式失败: {str(e)}")
+            raise
+
+    def _validate_format(self, output_format: str) -> None:
+        """Raise ValueError unless ``output_format`` (case-insensitive) is supported."""
+        if output_format.lower() not in self.supported_formats:
+            raise ValueError(
+                f"不支持的视频格式: {output_format}。"
+                f"支持的格式: {', '.join(self.supported_formats)}"
+            )
+
+    async def download_video(
+            self,
+            url: str,
+            resolution: str,
+            output_format: str = 'mp4',
+            rename: Optional[str] = None
+    ) -> Tuple[str, str, str]:
+        """
+        Download the video at the requested resolution.
+
+        Args:
+            url: YouTube video URL
+            resolution: target resolution ('2160p', '1440p', '1080p', '720p' etc.);
+                        input such as '1080p60' is handled as '1080p'
+            output_format: output video format (case-insensitive)
+            rename: optional new file name (without extension)
+
+        Returns:
+            Tuple[str, str, str]: (task_id, output_path, filename)
+
+        Raises:
+            ValueError: unsupported format, or no video at the requested resolution
+        """
+        try:
+            task_id = str(uuid4())
+            self._validate_format(output_format)
+            # One lower-cased extension everywhere, so the returned name matches the file
+            output_format = output_format.lower()
+
+            # Normalise the resolution ('1080p60' -> '1080p') and list the formats
+            base_resolution = resolution.split('p')[0] + 'p'
+            formats = self._get_video_formats(url)
+
+            # First format with video whose normalised resolution matches
+            target_format = None
+            for fmt in formats:
+                fmt_resolution = fmt['resolution']
+                if fmt_resolution != 'N/A':
+                    fmt_base_resolution = fmt_resolution.split('p')[0] + 'p'
+                    if fmt_base_resolution == base_resolution and fmt['vcodec'] != 'none':
+                        target_format = fmt
+                        break
+
+            if target_format is None:
+                # Report the (normalised) resolutions that are available instead
+                available_resolutions = set(
+                    fmt['resolution'].split('p')[0] + 'p'
+                    for fmt in formats
+                    if fmt['resolution'] != 'N/A' and fmt['vcodec'] != 'none'
+                )
+                raise ValueError(
+                    f"未找到 {base_resolution} 分辨率的视频。"
+                    f"可用分辨率: {', '.join(sorted(available_resolutions))}"
+                )
+
+            output_dir = utils.video_dir()
+            os.makedirs(output_dir, exist_ok=True)
+
+            if rename:
+                # '%' introduces a field in yt-dlp output templates; '%%' is a literal percent
+                filename = f"{rename}.{output_format}"
+                output_template = os.path.join(output_dir, filename.replace('%', '%%'))
+            else:
+                # Otherwise name the file after the task id and the original title
+                output_template = os.path.join(output_dir, f'{task_id}_%(title)s.%(ext)s')
+
+            ydl_opts = {
+                'format': f"{target_format['format_id']}+bestaudio[ext=m4a]/best",
+                'outtmpl': output_template,
+                'merge_output_format': output_format,
+                'postprocessors': [{
+                    'key': 'FFmpegVideoConvertor',
+                    'preferedformat': output_format,
+                }]
+            }
+
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=True)
+                if rename:
+                    output_path = os.path.join(output_dir, filename)
+                else:
+                    # yt-dlp sanitises the title in the file name: ask it for the real path
+                    stem = os.path.splitext(ydl.prepare_filename(info))[0]
+                    output_path = f"{stem}.{output_format}"
+                    filename = os.path.basename(output_path)
+
+            logger.info(f"视频下载成功: {output_path}")
+            return task_id, output_path, filename
+
+        except Exception:
+            logger.exception("下载视频失败")
+            raise
diff --git a/app/test/test_gemini.py b/app/test/test_gemini.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa96a393810640fea64a5d5f8b16b1846a3eb1e0
--- /dev/null
+++ b/app/test/test_gemini.py
@@ -0,0 +1,14 @@
+import google.generativeai as genai
+from app.config import config
+import os
+
+for _env_name, _proxy_key in (("HTTP_PROXY", "http"), ("HTTPS_PROXY", "https")):
+    if config.proxy.get(_proxy_key):  # os.environ only accepts str values; skip unset proxies
+        os.environ[_env_name] = config.proxy.get(_proxy_key)
+
+genai.configure(api_key="")  # NOTE: supply a real API key before running this manual check
+model = genai.GenerativeModel("gemini-1.5-pro")
+
+for i in range(50):  # manual connectivity check: 50 identical requests, each reply printed
+    response = model.generate_content("直接回复我文本'当前网络可用'")
+    print(i, response.text)
diff --git a/app/test/test_moviepy.py b/app/test/test_moviepy.py
new file mode 100644
index 0000000000000000000000000000000000000000..79d93c24238c75f666fe8cecd5d97b77e94018ce
--- /dev/null
+++ b/app/test/test_moviepy.py
@@ -0,0 +1,122 @@
+"""
+使用 moviepy 库剪辑指定时间戳视频，支持时分秒毫秒精度
+"""
+
+from moviepy.editor import VideoFileClip
+from datetime import datetime
+import os
+
+
+def time_str_to_seconds(time_str: str) -> float:
+    """
+    Convert an "HH:MM:SS,mmm" timestamp into seconds.
+
+    Args:
+        time_str: timestamp such as "00:01:23,456" (the clock part is parsed with %H:%M:%S)
+    Returns:
+        The number of seconds as a float
+    Raises:
+        ValueError: if the string does not have the expected layout
+    """
+    try:
+        # The clock part and the milliseconds are separated by exactly one comma
+        clock_text, millis_text = time_str.split(',')
+        parsed = datetime.strptime(clock_text, "%H:%M:%S")
+        whole = parsed.hour * 3600 + parsed.minute * 60 + parsed.second
+        # The part after the comma is a millisecond count
+        return whole + int(millis_text) / 1000
+    except ValueError as e:
+        raise ValueError("时间格式错误，请使用 HH:MM:SS,mmm 格式，例如 00:01:23,456") from e
+
+
+def format_duration(seconds: float) -> str:
+    """
+    Format a duration in seconds as "HH:MM:SS,mmm".
+    The value is rounded to whole milliseconds first and then split with integer
+    arithmetic, so float error cannot drop a millisecond (1.001 -> "00:00:01,001").
+    Args:
+        seconds: non-negative duration in seconds
+    Returns:
+        The formatted time string; hours are not wrapped at 24
+    """
+    total_ms = int(round(seconds * 1000))
+    total_seconds, milliseconds = divmod(total_ms, 1000)
+    minutes_total, whole_seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(minutes_total, 60)
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
+
+
+def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None:
+    """
+    Cut the [start_time, end_time] range out of a video and write it to output_path.
+
+    Args:
+        video_path: path of the source video
+        start_time: start of the range (format: "HH:MM:SS,mmm")
+        end_time: end of the range (format: "HH:MM:SS,mmm")
+        output_path: path of the file to write; an existing file is removed first
+
+    Raises:
+        ValueError: if the range is empty, reversed or beyond the video length
+    """
+    try:
+        # A bare file name has no directory part, and os.makedirs('') would raise
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+
+        # Remove an already existing output file first
+        if os.path.exists(output_path):
+            try:
+                os.remove(output_path)
+            except PermissionError:
+                print(f"无法删除已存在的文件：{output_path}，请确保文件未被其他程序占用")
+                return
+
+        start_seconds = time_str_to_seconds(start_time)
+        end_seconds = time_str_to_seconds(end_time)
+
+        video = VideoFileClip(video_path)
+        try:
+            # Validate the range; the outer finally closes the clip even when this raises
+            if start_seconds >= video.duration or end_seconds > video.duration:
+                raise ValueError(f"剪辑时间超出视频长度！视频总长度为: {format_duration(video.duration)}")
+
+            if start_seconds >= end_seconds:
+                raise ValueError("结束时间必须大于开始时间！")
+
+            clip_duration = end_seconds - start_seconds
+            print(f"原视频总长度: {format_duration(video.duration)}")
+            print(f"剪辑时长: {format_duration(clip_duration)}")
+            print(f"剪辑区间: {start_time} -> {end_time}")
+
+            clip = video.subclip(start_seconds, end_seconds)
+            try:
+                clip.write_videofile(
+                    output_path,
+                    codec='libx264',
+                    audio_codec='aac',
+                    temp_audiofile='temp-audio.m4a',
+                    remove_temp=True
+                )
+            except IOError as e:
+                print(f"写入视频文件时发生错误：{str(e)}")
+                raise
+            finally:
+                clip.close()
+        finally:
+            # Release the source clip on every path, including validation errors
+            video.close()
+
+    except Exception as e:
+        print(f"视频剪辑过程中发生错误：{str(e)}")
+        raise
+
+
+if __name__ == "__main__":  # manual run; the absolute sample paths below are hard-coded
+    cut_video(
+        video_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp.mp4",
+        start_time="00:00:00,789",
+        end_time="00:02:00,123",
+        output_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp_cut3.mp4"
+    )
diff --git a/app/test/test_moviepy_merge.py b/app/test/test_moviepy_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..da08e6ce9ac9bff14325456a99bcf8f510925d59
--- /dev/null
+++ b/app/test/test_moviepy_merge.py
@@ -0,0 +1,143 @@
+"""
+使用 moviepy 合并视频、音频、字幕和背景音乐
+"""
+
+from moviepy.editor import (
+    VideoFileClip,
+    AudioFileClip,
+    TextClip,
+    CompositeVideoClip,
+    concatenate_videoclips
+)
+# from moviepy.config import change_settings
+import os
+
+# Path of the font file (used to render Chinese subtitles)
+FONT_PATH = "../../resource/fonts/STHeitiMedium.ttc"  # make sure the font file exists at this path
+# change_settings(
+#     {"IMAGEMAGICK_BINARY": r"C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe"})  # on Windows the ImageMagick path has to be set
+
+
+class VideoMerger:
+    """Collects video clips, background music and subtitles and renders them into one file."""
+
+    def __init__(self, output_path: str = "../../resource/videos/merged_video.mp4"):
+        """
+        Initialise an empty merger.
+        Args:
+            output_path: path of the output file
+        """
+        self.output_path = output_path
+        self.video_clips = []
+        self.background_music = None
+        self.subtitles = []
+
+    def add_video(self, video_path: str, start_time: str = None, end_time: str = None) -> None:
+        """
+        Append a video clip; it is trimmed only when both bounds are given.
+        Args:
+            video_path: path of the video file
+            start_time: start time (format: "MM:SS")
+            end_time: end time (format: "MM:SS")
+        """
+        video = VideoFileClip(video_path)
+        if start_time and end_time:
+            video = video.subclip(self._time_to_seconds(start_time),
+                                  self._time_to_seconds(end_time))
+        self.video_clips.append(video)
+
+    def add_audio(self, audio_path: str, volume: float = 1.0) -> None:
+        """
+        Set the background music (a later call replaces an earlier one).
+        Args:
+            audio_path: path of the audio file
+            volume: volume factor (0.0-1.0)
+        """
+        self.background_music = AudioFileClip(audio_path).volumex(volume)
+
+    def add_subtitle(self, text: str, start_time: str, end_time: str,
+                     position: tuple = ('center', 'bottom'), fontsize: int = 24) -> None:
+        """
+        Add a subtitle that starts at start_time and lasts until end_time.
+        Args:
+            text: subtitle text
+            start_time: start time (format: "MM:SS")
+            end_time: end time (format: "MM:SS")
+            position: subtitle position
+            fontsize: font size
+        """
+        subtitle = TextClip(
+            text,
+            font=FONT_PATH,
+            fontsize=fontsize,
+            color='white',
+            stroke_color='black',
+            stroke_width=2
+        )
+
+        subtitle = subtitle.set_position(position).set_duration(
+            self._time_to_seconds(end_time) - self._time_to_seconds(start_time)
+        ).set_start(self._time_to_seconds(start_time))
+
+        self.subtitles.append(subtitle)
+
+    def merge(self) -> None:
+        """Render all collected media into output_path and release every clip afterwards."""
+        if not self.video_clips:
+            raise ValueError("至少需要添加一个视频片段")
+
+        final_video = None
+        try:
+            final_video = concatenate_videoclips(self.video_clips)
+
+            # The background music gets the video's duration and becomes its audio track
+            if self.background_music:
+                self.background_music = self.background_music.set_duration(final_video.duration)
+                final_video = final_video.set_audio(self.background_music)
+
+            # Overlay the subtitles on top of the video
+            if self.subtitles:
+                final_video = CompositeVideoClip([final_video] + self.subtitles)
+
+            final_video.write_videofile(
+                self.output_path,
+                fps=24,
+                codec='libx264',
+                audio_codec='aac'
+            )
+        finally:
+            # Release resources even when concatenation or the export fails
+            if final_video is not None:
+                final_video.close()
+            for clip in self.video_clips + self.subtitles:
+                clip.close()
+            if self.background_music:
+                self.background_music.close()
+
+    @staticmethod
+    def _time_to_seconds(time_str: str) -> float:
+        """Convert "SS", "MM:SS" or "HH:MM:SS" (fractional seconds allowed) to seconds."""
+        return sum(float(p) * 60 ** i for i, p in enumerate(reversed(time_str.split(':'))))
+
+
+def test_merge_video():
+    """Exercise the video merging flow end to end."""
+    merger = VideoMerger()
+
+    # Add two video clips
+    merger.add_video("../../resource/videos/cut_video.mp4", "00:00", "01:00")
+    merger.add_video("../../resource/videos/demo.mp4", "00:00", "00:30")
+
+    # Add background music
+    merger.add_audio("../../resource/songs/output000.mp3", volume=0.3)
+
+    # Add subtitles
+    merger.add_subtitle("第一个精彩片段", "00:00", "00:05")
+    merger.add_subtitle("第二个精彩片段", "01:00", "01:05")
+
+    # Merge and export
+    merger.merge()
+
+
+if __name__ == "__main__":  # run the merge test when this file is executed as a script
+    test_merge_video()
diff --git a/app/test/test_moviepy_speed.py b/app/test/test_moviepy_speed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3697ba20f61c8a8938510cf1a25b587e95a838d9
--- /dev/null
+++ b/app/test/test_moviepy_speed.py
@@ -0,0 +1,142 @@
+"""
+使用 moviepy 优化视频处理速度的示例
+包含：视频加速、多核处理、预设参数优化等
+"""
+
+from moviepy.editor import VideoFileClip
+from moviepy.video.fx.speedx import speedx
+import multiprocessing as mp
+import time
+
+
+class VideoSpeedProcessor:
+    """Re-encodes a video with speed-oriented settings, optionally changing playback speed."""
+
+    def __init__(self, input_path: str, output_path: str):
+        self.input_path = input_path
+        self.output_path = output_path
+        # Number of CPU cores; used as encoder thread count and worker pool size
+        self.cpu_cores = mp.cpu_count()
+
+    def process_with_optimization(self, speed_factor: float = 1.0) -> None:
+        """
+        Re-encode the whole input video with speed-oriented parameters.
+        Args:
+            speed_factor: speed multiplier (1.0 keeps the speed, 2.0 doubles it)
+        """
+        start_time = time.time()
+
+        # Load the video with speed-oriented parameters
+        video = VideoFileClip(
+            self.input_path,
+            audio=True,  # can be set to False when audio is not needed
+            target_resolution=(720, None),  # a lower resolution speeds up processing
+            resize_algorithm='fast_bilinear'  # fast resize algorithm
+        )
+
+        try:
+            # Apply the speed change
+            if speed_factor != 1.0:
+                video = speedx(video, factor=speed_factor)
+
+            # Export with speed-oriented parameters
+            video.write_videofile(
+                self.output_path,
+                codec='libx264',  # h264 encoding
+                audio_codec='aac',  # audio encoding
+                temp_audiofile='temp-audio.m4a',  # temporary audio file
+                remove_temp=True,  # delete the temporary file when done
+                write_logfile=False,  # no log file
+                threads=self.cpu_cores,  # encode with multiple threads
+                preset='ultrafast',  # fastest encoding preset
+                ffmpeg_params=[
+                    '-brand', 'mp42',
+                    '-crf', '23',  # compression level, range 0-51; higher compresses more
+                ]
+            )
+        finally:
+            # Release the clip even when the speed change or the export raises
+            video.close()
+
+        end_time = time.time()
+        print(f"处理完成！用时: {end_time - start_time:.2f} 秒")
+
+    def batch_process_segments(self, segment_times: list, speed_factor: float = 1.0) -> None:
+        """
+        Process several segments of the input video in parallel worker processes.
+        Args:
+            segment_times: list of (start, end) time tuples
+            speed_factor: speed multiplier
+        """
+        start_time = time.time()
+
+        # Create the process pool
+        with mp.Pool(processes=self.cpu_cores) as pool:
+            # One argument tuple per segment; the index names the output file
+            args = [(self.input_path, start, end, speed_factor, i)
+                   for i, (start, end) in enumerate(segment_times)]
+
+            # Process the segments in parallel
+            pool.starmap(self._process_segment, args)
+
+        end_time = time.time()
+        print(f"批量处理完成！总用时: {end_time - start_time:.2f} 秒")
+
+    @staticmethod
+    def _process_segment(video_path: str, start: str, end: str,
+                        speed_factor: float, index: int) -> None:
+        """Cut one segment, apply the speed factor and write it as segment_<index>.mp4."""
+        # Convert the time bounds to seconds
+        start_sec = VideoSpeedProcessor._time_to_seconds(start)
+        end_sec = VideoSpeedProcessor._time_to_seconds(end)
+
+        # Load the video and cut the segment
+        video = VideoFileClip(
+            video_path,
+            audio=True,
+            target_resolution=(720, None)
+        ).subclip(start_sec, end_sec)
+
+        try:
+            if speed_factor != 1.0:
+                video = speedx(video, factor=speed_factor)
+
+            output_path = f"../../resource/videos/segment_{index}.mp4"
+            video.write_videofile(
+                output_path,
+                codec='libx264',
+                audio_codec='aac',
+                preset='ultrafast',
+                threads=2  # encoder threads used by each worker process
+            )
+        finally:
+            # Release the clip even when the export raises
+            video.close()
+
+    @staticmethod
+    def _time_to_seconds(time_str: str) -> float:
+        """Convert "SS", "MM:SS" or "HH:MM:SS" (fractional seconds allowed) to seconds."""
+        return sum(float(p) * 60 ** i for i, p in enumerate(reversed(time_str.split(':'))))
+
+
+def test_video_speed():
+    """Exercise the sped-up video processing."""
+    processor = VideoSpeedProcessor(
+        "../../resource/videos/best.mp4",
+        "../../resource/videos/speed_up.mp4"
+    )
+    
+    # Test 1: simple speed-up
+    processor.process_with_optimization(speed_factor=1.5)  # 1.5x speed
+    
+    # Test 2: process several segments in parallel
+    segments = [
+        ("00:00", "01:00"),
+        ("01:00", "02:00"),
+        ("02:00", "03:00")
+    ]
+    processor.batch_process_segments(segments, speed_factor=2.0)  # 2x speed
+
+
+if __name__ == "__main__":  # run the speed test when this file is executed as a script
+    test_video_speed()
diff --git a/app/test/test_qwen.py b/app/test/test_qwen.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a692257b459d71c54e4d34925459403827aaf3d
--- /dev/null
+++ b/app/test/test_qwen.py
@@ -0,0 +1,105 @@
+import os
+import traceback
+import json
+from openai import OpenAI
+from pydantic import BaseModel
+from typing import List
+from app.utils import utils
+from app.services.subtitle import extract_audio_and_create_subtitle
+
+
+class Step(BaseModel):  # fields mirror one entry of the JSON example in the __main__ prompt
+    timestamp: str
+    picture: str
+    narration: str
+    OST: int
+    new_timestamp: str
+
+class MathReasoning(BaseModel):  # NOTE(review): wraps a list of Step entries; the name does not reflect that
+    result: List[Step]
+
+
+def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
+    """
+    Send a prompt plus the content of a subtitle file to the Qwen chat model.
+
+    Args:
+        prompt (str): user prompt; the subtitle file content is appended to it
+        system_message (str): system prompt that sets the assistant's behaviour
+        subtitle_path (str): path of the subtitle file (read as UTF-8)
+    Returns:
+        str: content of the first reply choice
+
+    Raises:
+        Exception: when DASHSCOPE_API_KEY is not set, the file is unreadable or the API call fails
+    """
+    try:
+        # The key is read from the environment; credentials must not live in source control
+        client = OpenAI(
+            api_key=os.environ["DASHSCOPE_API_KEY"],
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+        )
+
+        with open(subtitle_path, "r", encoding="utf-8") as file:
+            subtitle_content = file.read()
+
+        completion = client.chat.completions.create(
+            model="qwen-turbo-2024-11-01",
+            messages=[
+                {'role': 'system', 'content': system_message},
+                {'role': 'user', 'content': prompt + subtitle_content}
+            ]
+        )
+        return completion.choices[0].message.content
+
+    except Exception as e:
+        error_message = f"调用千问API时发生错误：{str(e)}"
+        print(error_message)
+        print("请参考文档：https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
+        raise Exception(error_message) from e
+
+
+# Usage example
+if __name__ == "__main__":
+    try:
+        video_path = utils.video_dir("duanju_yuansp.mp4")
+        # # Check whether the video exists
+        # if not os.path.exists(video_path):
+        #     print(f"视频文件不存在：{video_path}")
+        #     exit(1)
+        # Extract the subtitles
+        subtitle_path = os.path.join(utils.video_dir(""), "duanju_yuan.srt")
+        extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
+        # Analyse the subtitles (the first example timestamp previously read "00,01:44,000")
+        system_message = """
+        你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配。
+        输出需严格按照如下 json 格式:
+        [
+            {
+                "timestamp": "00:00:50,020-00:01:44,000",
+                "picture": "画面1",
+                "narration": "播放原声",
+                "OST": 0,
+                "new_timestamp": "00:00:00,000-00:00:54,020"
+            },
+            {
+                "timestamp": "01:49-02:30",
+                "picture": "画面2",
+                "narration": "播放原声",
+                "OST": 2,
+                "new_timestamp": "00:54-01:35"
+            },
+        ]
+        """
+        prompt = "字幕如下：\n"
+        response = chat_with_qwen(prompt, system_message, subtitle_path)
+        print(response)
+        # Save the JSON; note the timestamps need converting to minutes:seconds (currently "timestamp": "00:00:00,020-00:00:01,660", to be converted to "timestamp": "00:00-01:66")
+        # response = json.loads(response)
+        # for item in response:
+        #     item["timestamp"] = item["timestamp"].replace(":", "-")
+        # with open(os.path.join(utils.video_dir(""), "duanju_yuan.json"), "w", encoding="utf-8") as file:
+        #     json.dump(response, file, ensure_ascii=False)
+
+    except Exception:
+        print(traceback.format_exc())
diff --git a/app/utils/check_script.py b/app/utils/check_script.py
new file mode 100644
index 0000000000000000000000000000000000000000..00e6c0ffd2f4b836230b0cabeb683154fe0e7a0d
--- /dev/null
+++ b/app/utils/check_script.py
@@ -0,0 +1,81 @@
+import json
+from typing import Dict, Any
+
+def check_format(script_content: str) -> Dict[str, Any]:
+    """Check that a script is a JSON array of well-formed clip objects.
+
+    Every clip must be a JSON object whose ``narration``, ``picture`` and
+    ``timestamp`` values are non-blank strings.  For each clip the checks run
+    in this order: object type, presence of all fields, field types, blank
+    values.  The first problem found is reported.
+
+    Args:
+        script_content: script content (JSON text)
+
+    Returns:
+        Dict: {'success': bool, 'message': str}; problems are reported through
+        the result rather than raised.
+    """
+    # Checked in this order for every clip
+    required_fields = ['narration', 'picture', 'timestamp']
+
+    try:
+        # Must be valid JSON
+        data = json.loads(script_content)
+
+        # Must be a list
+        if not isinstance(data, list):
+            return {
+                'success': False,
+                'message': '脚本必须是JSON数组格式'
+            }
+
+        for i, clip in enumerate(data):
+            # A non-object clip used to surface as a generic "error during check"
+            # message (or a misleading "missing field"); report it explicitly.
+            if not isinstance(clip, dict):
+                return {
+                    'success': False,
+                    'message': f'第{i+1}个片段必须是JSON对象'
+                }
+
+            # Required fields must all be present
+            for field in required_fields:
+                if field not in clip:
+                    return {
+                        'success': False,
+                        'message': f'第{i+1}个片段缺少必需字段: {field}'
+                    }
+
+            # Field values must be strings
+            for field in required_fields:
+                if not isinstance(clip[field], str):
+                    return {
+                        'success': False,
+                        'message': f'第{i+1}个片段的{field}必须是字符串'
+                    }
+
+            # Field values must not be blank
+            for field in required_fields:
+                if not clip[field].strip():
+                    return {
+                        'success': False,
+                        'message': f'第{i+1}个片段的{field}不能为空'
+                    }
+
+        return {
+            'success': True,
+            'message': '脚本格式检查通过'
+        }
+
+    except json.JSONDecodeError as e:
+        return {
+            'success': False,
+            'message': f'JSON格式错误: {str(e)}'
+        }
+    except Exception as e:
+        # Last-resort guard so callers always get a result dict
+        return {
+            'success': False,
+            'message': f'检查过程中发生错误: {str(e)}'
+        }
diff --git a/app/utils/ffmpeg_utils.py b/app/utils/ffmpeg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..58ae83d444e07301906339d0a2c56412a3b38ffc
--- /dev/null
+++ b/app/utils/ffmpeg_utils.py
@@ -0,0 +1,513 @@
+"""
+FFmpeg utility module - helper functions for working with FFmpeg, in particular hardware-acceleration detection
+"""
+import os
+import platform
+import subprocess
+from typing import Dict, List, Optional, Tuple, Union
+from loguru import logger
+
+# Module-level cache for the detected hardware-acceleration info; "type" stays None until an accelerator is found
+_FFMPEG_HW_ACCEL_INFO = {
+    "available": False,
+    "type": None,
+    "encoder": None,
+    "hwaccel_args": [],
+    "message": "",
+    "is_dedicated_gpu": False
+}
+
+
+def check_ffmpeg_installation() -> bool:
+    """
+    Report whether the ``ffmpeg`` executable can be run successfully.
+
+    Returns:
+        bool: True when ``ffmpeg -version`` exits with status 0, False when it is missing or fails
+    """
+    # Output is captured only to keep it off the console; on Windows it is decoded as UTF-8
+    run_options = {"stdout": subprocess.PIPE, "stderr": subprocess.PIPE, "check": True}
+    if os.name == 'nt':
+        run_options["encoding"] = 'utf-8'
+    try:
+        subprocess.run(['ffmpeg', '-version'], **run_options)
+    except (subprocess.SubprocessError, FileNotFoundError):
+        # Covers a missing binary as well as a non-zero exit status (check=True)
+        logger.error("ffmpeg未安装或不在系统PATH中，请安装ffmpeg")
+        return False
+    return True
+
+
+def detect_hardware_acceleration() -> Dict[str, Union[bool, str, List[str], None]]:
+    """
+    Detect the hardware accelerator FFmpeg can use and cache the result module-wide.
+
+    The OS-specific probes run at most once per process: after they complete, a
+    ``"detected"`` flag is stored in the cache so that a negative result (``"type"``
+    still None) no longer re-spawns the ffmpeg probes on every call.
+
+    Returns:
+        Dict: the shared ``_FFMPEG_HW_ACCEL_INFO`` dictionary (not a copy)
+    """
+    global _FFMPEG_HW_ACCEL_INFO
+
+    # Return the cached result if detection already ran (successfully or not)
+    if _FFMPEG_HW_ACCEL_INFO.get("detected") or _FFMPEG_HW_ACCEL_INFO["type"] is not None:
+        return _FFMPEG_HW_ACCEL_INFO
+
+    # Without ffmpeg nothing can be probed; not cached, so a later install is picked up
+    if not check_ffmpeg_installation():
+        _FFMPEG_HW_ACCEL_INFO["message"] = "FFmpeg未安装或不在系统PATH中"
+        return _FFMPEG_HW_ACCEL_INFO
+
+    # Identify the operating system
+    system = platform.system().lower()
+    logger.debug(f"检测硬件加速 - 操作系统: {system}")
+
+    # Ask ffmpeg which hwaccel methods it lists (decoded as UTF-8 on Windows)
+    try:
+        text_options = {"encoding": 'utf-8'} if os.name == 'nt' else {}
+        hwaccels_cmd = subprocess.run(
+            ['ffmpeg', '-hide_banner', '-hwaccels'],
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, **text_options
+        )
+        supported_hwaccels = hwaccels_cmd.stdout.lower()
+    except Exception as e:
+        logger.error(f"获取FFmpeg硬件加速器列表失败: {str(e)}")
+        supported_hwaccels = ""
+
+    # Dispatch to the OS-specific probe
+    if system == 'darwin':  # macOS
+        _detect_macos_acceleration(supported_hwaccels)
+    elif system == 'windows':  # Windows
+        _detect_windows_acceleration(supported_hwaccels)
+    elif system == 'linux':  # Linux
+        _detect_linux_acceleration(supported_hwaccels)
+    else:
+        logger.warning(f"不支持的操作系统: {system}")
+        _FFMPEG_HW_ACCEL_INFO["message"] = f"不支持的操作系统: {system}"
+
+    # Remember that detection ran so later calls do not repeat the probes
+    _FFMPEG_HW_ACCEL_INFO["detected"] = True
+
+    # Per the original author's note the result is reported at start-up, so it is not logged here
+
+    return _FFMPEG_HW_ACCEL_INFO
+
+
+def _detect_macos_acceleration(supported_hwaccels: str) -> None:
+    """
+    Probe for VideoToolbox on macOS and record the outcome in the module cache.
+
+    Args:
+        supported_hwaccels: text listing the hardware accelerators FFmpeg supports
+    """
+    global _FFMPEG_HW_ACCEL_INFO
+    info = _FFMPEG_HW_ACCEL_INFO
+    failure_message = "macOS系统未检测到可用的videotoolbox硬件加速"
+
+    # Guard clause: videotoolbox is not in the supported list
+    if 'videotoolbox' not in supported_hwaccels:
+        info["message"] = failure_message
+        return
+
+    # Trial run; only a zero exit status counts as a working accelerator
+    probe = ["ffmpeg", "-hwaccel", "videotoolbox", "-i", "/dev/null", "-f", "null", "-"]
+    try:
+        outcome = subprocess.run(probe, stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False)
+        if outcome.returncode == 0:
+            # Recorded as a non-dedicated (integrated) GPU
+            info.update(available=True, type="videotoolbox", encoder="h264_videotoolbox",
+                        hwaccel_args=["-hwaccel", "videotoolbox"], is_dedicated_gpu=False)
+            return
+    except Exception as e:
+        logger.debug(f"测试videotoolbox失败: {str(e)}")
+    info["message"] = failure_message
+
+
+def _detect_windows_acceleration(supported_hwaccels: str) -> None:
+    """
+    Probe the Windows acceleration options in order (CUDA, QSV, D3D11VA, DXVA2, NVENC) and record the first that works.
+
+    Args:
+        supported_hwaccels: text listing the hardware accelerators FFmpeg supports
+    """
+    global _FFMPEG_HW_ACCEL_INFO
+
+    # On Windows, fetch the GPU description first
+    gpu_info = _get_windows_gpu_info()
+
+    # AMD GPUs: hardware acceleration is skipped on purpose and only a message is recorded
+    if 'amd' in gpu_info.lower() or 'radeon' in gpu_info.lower():
+        logger.info("检测到AMD显卡，为避免兼容性问题，将使用软件编码")
+        _FFMPEG_HW_ACCEL_INFO["message"] = "检测到AMD显卡，为避免兼容性问题，将使用软件编码"
+        return
+
+    # Check for an Intel integrated GPU (name contains "HD Graphics" or "UHD Graphics")
+    is_intel_integrated = False
+    if 'intel' in gpu_info.lower() and ('hd graphics' in gpu_info.lower() or 'uhd graphics' in gpu_info.lower()):
+        logger.info("检测到Intel集成显卡")
+        is_intel_integrated = True
+
+    # Probe NVIDIA CUDA support
+    if 'cuda' in supported_hwaccels and 'nvidia' in gpu_info.lower():
+        # Debug log
+        logger.debug(f"Windows检测到NVIDIA显卡，尝试CUDA加速")
+        try:
+            # First check whether the NVENC encoder is listed (output decoded as UTF-8)
+            encoders_cmd = subprocess.run(
+                ["ffmpeg", "-hide_banner", "-encoders"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+            )
+            has_nvenc = "h264_nvenc" in encoders_cmd.stdout.lower()
+            logger.debug(f"NVENC编码器检测结果: {'可用' if has_nvenc else '不可用'}")
+
+            # Trial-run CUDA decoding (UTF-8 output); a listed h264_nvenc encoder is accepted below even if this run fails
+            test_cmd = subprocess.run(
+                ["ffmpeg", "-hwaccel", "cuda", "-i", "NUL", "-f", "null", "-t", "0.1", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+            )
+
+            # Log the detailed result to help debugging (stderr is truncated to 200 characters)
+            logger.debug(f"CUDA测试返回码: {test_cmd.returncode}")
+            logger.debug(f"CUDA测试错误输出: {test_cmd.stderr[:200]}..." if len(test_cmd.stderr) > 200 else f"CUDA测试错误输出: {test_cmd.stderr}")
+
+            if test_cmd.returncode == 0 or has_nvenc:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "cuda"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_nvenc"
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "cuda"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
+                return
+
+            # If the test above failed, try a second variant with a CUDA output format (UTF-8 output)
+            test_cmd2 = subprocess.run(
+                ["ffmpeg", "-hide_banner", "-loglevel", "error", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda", "-i", "NUL", "-f", "null", "-t", "0.1", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+            )
+
+            if test_cmd2.returncode == 0:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "cuda"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_nvenc"
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "cuda", "-hwaccel_output_format", "cuda"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
+                return
+        except Exception as e:
+            logger.debug(f"测试CUDA失败: {str(e)}")
+
+    # Probe Intel QSV (Intel GPUs only). NOTE(review): this probe reads "/dev/null" while the other Windows probes read "NUL" - confirm
+    if 'qsv' in supported_hwaccels and 'intel' in gpu_info.lower():
+        try:
+            test_cmd = subprocess.run(
+                ["ffmpeg", "-hwaccel", "qsv", "-i", "/dev/null", "-f", "null", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
+            )
+            if test_cmd.returncode == 0:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "qsv"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_qsv"
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "qsv"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = not is_intel_integrated
+                return
+        except Exception as e:
+            logger.debug(f"测试QSV失败: {str(e)}")
+
+    # Probe D3D11VA support
+    if 'd3d11va' in supported_hwaccels:
+        logger.debug("Windows尝试D3D11VA加速")
+        try:
+            test_cmd = subprocess.run(
+                ["ffmpeg", "-hwaccel", "d3d11va", "-i", "NUL", "-f", "null", "-t", "0.1", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+            )
+
+            # Log the return code to help debugging
+            logger.debug(f"D3D11VA测试返回码: {test_cmd.returncode}")
+
+            if test_cmd.returncode == 0:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "d3d11va"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264"  # D3D11VA is decode-only; encoding still uses the software encoder
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "d3d11va"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = not is_intel_integrated
+                return
+        except Exception as e:
+            logger.debug(f"测试D3D11VA失败: {str(e)}")
+
+    # Probe DXVA2 support
+    if 'dxva2' in supported_hwaccels:
+        logger.debug("Windows尝试DXVA2加速")
+        try:
+            test_cmd = subprocess.run(
+                ["ffmpeg", "-hwaccel", "dxva2", "-i", "NUL", "-f", "null", "-t", "0.1", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+            )
+
+            # Log the return code to help debugging
+            logger.debug(f"DXVA2测试返回码: {test_cmd.returncode}")
+
+            if test_cmd.returncode == 0:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "dxva2"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264"  # DXVA2 is decode-only; encoding still uses the software encoder
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "dxva2"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = not is_intel_integrated
+                return
+        except Exception as e:
+            logger.debug(f"测试DXVA2失败: {str(e)}")
+
+    # NVIDIA GPU present but every probe above failed: try the NVENC encoder directly
+    if 'nvidia' in gpu_info.lower():
+        logger.debug("Windows检测到NVIDIA显卡，尝试直接使用NVENC编码器")
+        try:
+            # Check whether the NVENC encoder is listed (output decoded as UTF-8)
+            encoders_cmd = subprocess.run(
+                ["ffmpeg", "-hide_banner", "-encoders"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+            )
+
+            if "h264_nvenc" in encoders_cmd.stdout.lower():
+                logger.debug("NVENC编码器可用，尝试直接使用")
+                # Trial-encode a short synthetic black clip with NVENC (output decoded as UTF-8)
+                test_cmd = subprocess.run(
+                    ["ffmpeg", "-f", "lavfi", "-i", "color=c=black:s=640x360:r=30", "-c:v", "h264_nvenc", "-t", "0.1", "-f", "null", "-"],
+                    stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
+                )
+
+                logger.debug(f"NVENC编码器测试返回码: {test_cmd.returncode}")
+
+                if test_cmd.returncode == 0:
+                    _FFMPEG_HW_ACCEL_INFO["available"] = True
+                    _FFMPEG_HW_ACCEL_INFO["type"] = "nvenc"
+                    _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_nvenc"
+                    _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = []  # no -hwaccel arguments; the encoder is used directly
+                    _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
+                    return
+        except Exception as e:
+            logger.debug(f"测试NVENC编码器失败: {str(e)}")
+
+    _FFMPEG_HW_ACCEL_INFO["message"] = f"Windows系统未检测到可用的硬件加速，显卡信息: {gpu_info}"
+
+
+def _detect_linux_acceleration(supported_hwaccels: str) -> None:
+    """
+    Probe the Linux acceleration options in order (CUDA, VAAPI, QSV) and record the first that works.
+
+    Args:
+        supported_hwaccels: text listing the hardware accelerators FFmpeg supports
+    """
+    global _FFMPEG_HW_ACCEL_INFO
+
+    # Fetch the Linux GPU description and derive vendor flags from it
+    gpu_info = _get_linux_gpu_info()
+    is_nvidia = 'nvidia' in gpu_info.lower()
+    is_intel = 'intel' in gpu_info.lower()
+    is_amd = 'amd' in gpu_info.lower() or 'radeon' in gpu_info.lower()
+
+    # Probe NVIDIA CUDA support
+    if 'cuda' in supported_hwaccels and is_nvidia:
+        try:
+            test_cmd = subprocess.run(
+                ["ffmpeg", "-hwaccel", "cuda", "-i", "/dev/null", "-f", "null", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
+            )
+            if test_cmd.returncode == 0:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "cuda"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_nvenc"
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "cuda"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
+                return
+        except Exception as e:
+            logger.debug(f"测试CUDA失败: {str(e)}")
+
+    # Probe VAAPI support
+    if 'vaapi' in supported_hwaccels:
+        # Use the first DRM render node that exists
+        render_devices = ['/dev/dri/renderD128', '/dev/dri/renderD129']
+        render_device = None
+        for device in render_devices:
+            if os.path.exists(device):
+                render_device = device
+                break
+
+        if render_device:
+            try:
+                test_cmd = subprocess.run(
+                    ["ffmpeg", "-hwaccel", "vaapi", "-vaapi_device", render_device,
+                     "-i", "/dev/null", "-f", "null", "-"],
+                    stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
+                )
+                if test_cmd.returncode == 0:
+                    _FFMPEG_HW_ACCEL_INFO["available"] = True
+                    _FFMPEG_HW_ACCEL_INFO["type"] = "vaapi"
+                    _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_vaapi"
+                    _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "vaapi", "-vaapi_device", render_device]
+                    # Decide from the GPU vendor flags whether this counts as a dedicated GPU
+                    _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = is_nvidia or (is_amd and not is_intel)
+                    return
+            except Exception as e:
+                logger.debug(f"测试VAAPI失败: {str(e)}")
+
+    # Probe Intel QSV support
+    if 'qsv' in supported_hwaccels and is_intel:
+        try:
+            test_cmd = subprocess.run(
+                ["ffmpeg", "-hwaccel", "qsv", "-i", "/dev/null", "-f", "null", "-"],
+                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
+            )
+            if test_cmd.returncode == 0:
+                _FFMPEG_HW_ACCEL_INFO["available"] = True
+                _FFMPEG_HW_ACCEL_INFO["type"] = "qsv"
+                _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_qsv"
+                _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "qsv"]
+                _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = False  # Intel QSV is treated as an integrated GPU
+                return
+        except Exception as e:
+            logger.debug(f"测试QSV失败: {str(e)}")
+
+    _FFMPEG_HW_ACCEL_INFO["message"] = f"Linux系统未检测到可用的硬件加速，显卡信息: {gpu_info}"
+
+
+def _get_windows_gpu_info() -> str:
+    """
+    Query the names of the installed video controllers on Windows.
+
+    Returns:
+        str: raw stdout of the query command, or "Unknown GPU" if querying raised
+    """
+    # Options shared by both query commands (output is decoded as UTF-8)
+    capture = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8', text=True, check=False)
+    powershell_query = [
+        'powershell', '-Command',
+        "Get-WmiObject Win32_VideoController | Select-Object Name | Format-List",
+    ]
+    wmic_query = ['wmic', 'path', 'win32_VideoController', 'get', 'name']
+
+    try:
+        # PowerShell first; wmic is the fallback when PowerShell printed nothing
+        listing = subprocess.run(powershell_query, **capture).stdout
+        if not listing.strip():
+            listing = subprocess.run(wmic_query, **capture).stdout
+
+        # Log the full listing to help debugging
+        logger.debug(f"Windows显卡信息: {listing}")
+        return listing
+    except Exception as e:
+        logger.warning(f"获取Windows显卡信息失败: {str(e)}")
+        return "Unknown GPU"
+
+
+def _get_linux_gpu_info() -> str:
+    """
+    Collect a description of the graphics adapters on Linux.
+
+    ``lspci`` is tried first, then ``glxinfo``. Each tool is executed directly (no shell)
+    and its output is filtered here, because with ``shell=True`` plus an argument list the
+    ``| grep`` part never reached the command. "3D controller" rows are kept as well, since
+    lspci can list a secondary GPU under that class instead of "VGA".
+
+    Returns:
+        str: the matching output lines, or "Unknown GPU" when nothing could be found
+    """
+    # (command, lower-case keywords of which a line must contain at least one)
+    probes = [
+        (['lspci', '-v', '-nn'], ('vga', 'display', '3d controller')),
+        (['glxinfo'], ('vendor', 'renderer')),
+    ]
+    for command, keywords in probes:
+        try:
+            result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False)
+        except Exception as e:
+            # Typically the tool is not installed; fall through to the next probe
+            logger.debug(f"获取Linux显卡信息失败: {str(e)}")
+            continue
+        matches = [line for line in result.stdout.splitlines() if any(k in line.lower() for k in keywords)]
+        if matches:
+            return "\n".join(matches)
+    return "Unknown GPU"
+
+
+def get_ffmpeg_hwaccel_args() -> List[str]:
+    """
+    Return the FFmpeg hardware-acceleration arguments stored in the detection cache.
+
+    Returns:
+        List[str]: the cached "hwaccel_args" list (the stored list object, not a copy)
+    """
+    info = _FFMPEG_HW_ACCEL_INFO
+    # Lazily trigger detection while no accelerator type has been recorded
+    if info["type"] is None:
+        detect_hardware_acceleration()
+    return info["hwaccel_args"]
+
+
+def get_ffmpeg_hwaccel_type() -> Optional[str]:
+    """
+    Return the detected hardware-acceleration type.
+
+    Returns:
+        Optional[str]: the cached "type" value, or None when no accelerator is available
+    """
+    cache = _FFMPEG_HW_ACCEL_INFO
+    if cache["type"] is None:  # nothing recorded yet -> run detection
+        detect_hardware_acceleration()
+    if not cache["available"]:
+        return None
+    return cache["type"]
+
+
+def get_ffmpeg_hwaccel_encoder() -> Optional[str]:
+    """
+    Return the encoder name recorded together with the detected accelerator.
+
+    Returns:
+        Optional[str]: the cached "encoder" value, or None when no accelerator is available
+    """
+    cache = _FFMPEG_HW_ACCEL_INFO
+    if cache["type"] is None:  # nothing recorded yet -> run detection
+        detect_hardware_acceleration()
+    encoder = cache["encoder"] if cache["available"] else None
+    return encoder
+
+
+def get_ffmpeg_hwaccel_info() -> Dict[str, Union[bool, str, List[str], None]]:
+    """
+    Return the complete hardware-acceleration cache, running detection first if needed.
+
+    Returns:
+        Dict: the shared module-level dictionary (not a copy)
+    """
+    type_recorded = _FFMPEG_HW_ACCEL_INFO["type"] is not None
+    if not type_recorded:
+        # No accelerator type recorded yet, so run detection
+        detect_hardware_acceleration()
+    return _FFMPEG_HW_ACCEL_INFO
+
+
+def is_ffmpeg_hwaccel_available() -> bool:
+    """
+    Tell whether a usable FFmpeg hardware accelerator was detected.
+
+    Returns:
+        bool: the cached "available" flag
+    """
+    # Detection is triggered lazily while no accelerator type is cached
+    needs_detection = _FFMPEG_HW_ACCEL_INFO["type"] is None
+    if needs_detection:
+        detect_hardware_acceleration()
+    return _FFMPEG_HW_ACCEL_INFO["available"]
+
+
+def is_dedicated_gpu() -> bool:
+    """
+    Tell whether the detected accelerator was classified as a dedicated GPU.
+
+    Returns:
+        bool: the cached "is_dedicated_gpu" flag
+    """
+    info = _FFMPEG_HW_ACCEL_INFO
+    if info["type"] is None:
+        # Nothing cached yet: run detection before reading the flag
+        detect_hardware_acceleration()
+    return info["is_dedicated_gpu"]
diff --git a/app/utils/gemini_analyzer.py b/app/utils/gemini_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7236a9e39f6ee0af1c7f1f5eb4bedbc82be6c20c
--- /dev/null
+++ b/app/utils/gemini_analyzer.py
@@ -0,0 +1,217 @@
+import json
+from typing import List, Union, Dict
+import os
+from pathlib import Path
+from loguru import logger
+from tqdm import tqdm
+import asyncio
+from tenacity import retry, stop_after_attempt, RetryError, retry_if_exception_type, wait_exponential
+from google.api_core import exceptions
+import google.generativeai as genai
+import PIL.Image
+import traceback
+from app.utils import utils
+
+
+class VisionAnalyzer:
+    """Vision analyzer that sends batches of images to a Gemini model."""
+
+    def __init__(self, model_name: str = "gemini-1.5-flash", api_key: str = None):
+        """Store the model name and API key, then configure the client; raises ValueError when the key is missing."""
+        if not api_key:
+            raise ValueError("必须提供API密钥")
+
+        self.model_name = model_name
+        self.api_key = api_key
+
+        # Initialise the client configuration
+        self._configure_client()
+
+    def _configure_client(self):
+        """Configure the google.generativeai client and create the model object."""
+        genai.configure(api_key=self.api_key)
+        # Relax the Gemini safety settings: BLOCK_NONE for the four categories below
+        from google.generativeai.types import HarmCategory, HarmBlockThreshold
+        safety_settings = {
+            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+        }
+        self.model = genai.GenerativeModel(self.model_name, safety_settings=safety_settings)
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type(exceptions.ResourceExhausted)
+    )
+    async def _generate_content_with_retry(self, prompt, batch):
+        """Call generate_content_async; quota errors propagate unchanged so the retry policy above can match them."""
+        try:
+            return await self.model.generate_content_async([prompt, *batch])
+        except exceptions.ResourceExhausted as e:
+            logger.warning(f"API配额限制: {str(e)}")
+            raise  # re-raise as-is: wrapping it in RetryError hid it from retry_if_exception_type
+
+    async def analyze_images(self,
+                           images: Union[List[str], List[PIL.Image.Image]],
+                           prompt: str,
+                           batch_size: int) -> List[Dict]:
+        """Analyze images in batches of ``batch_size``; returns one dict per batch ('response' on success, 'error' after three failures)."""
+        try:
+            # Validate the list first so an empty input reports the intended error instead of an IndexError
+            if not images:
+                raise ValueError("图片列表为空")
+
+            # Load the images when file paths were given
+            if isinstance(images[0], str):
+                images = self.load_images(images)
+
+            # Validate every image object
+            valid_images = []
+            for i, img in enumerate(images):
+                if not isinstance(img, PIL.Image.Image):
+                    logger.error(f"无效的图片对象，索引 {i}: {type(img)}")
+                    continue
+                valid_images.append(img)
+
+            if not valid_images:
+                raise ValueError("没有有效的图片对象")
+
+            images = valid_images
+            results = []
+            # Number of batches = frame count / batch size, rounded up
+            batches_needed = len(images) // batch_size
+            if len(images) % batch_size > 0:
+                batches_needed += 1
+
+            logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed} 次")
+
+            with tqdm(total=batches_needed, desc="分析进度") as pbar:
+                for i in range(0, len(images), batch_size):
+                    batch = images[i:i + batch_size]
+                    retry_count = 0
+
+                    while retry_count < 3:
+                        try:
+                            # Each attempt here already includes the tenacity retries inside
+                            # _generate_content_with_retry; this outer loop adds up to three
+                            # rounds, pausing 60 seconds between them.
+
+                            # Make sure every image in the batch is valid
+                            valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]
+                            if not valid_batch:
+                                raise ValueError(f"批次 {i // batch_size} 中没有有效的图片")
+
+                            response = await self._generate_content_with_retry(prompt, valid_batch)
+                            results.append({
+                                'batch_index': i // batch_size,
+                                'images_processed': len(valid_batch),
+                                'response': response.text,
+                                'model_used': self.model_name
+                            })
+                            break
+
+                        except Exception as e:
+                            retry_count += 1
+                            error_msg = f"批次 {i // batch_size} 处理出错: {str(e)}"
+                            logger.error(error_msg)
+
+                            if retry_count >= 3:
+                                results.append({
+                                    'batch_index': i // batch_size,
+                                    'images_processed': len(batch),
+                                    'error': error_msg,
+                                    'model_used': self.model_name
+                                })
+                            else:
+                                logger.info(f"批次 {i // batch_size} 处理失败，等待60秒后重试当前批次...")
+                                await asyncio.sleep(60)
+
+                    pbar.update(1)
+
+            return results
+
+        except Exception as e:
+            error_msg = f"图片分析过程中发生错误: {str(e)}\n{traceback.format_exc()}"
+            logger.error(error_msg)
+            raise Exception(error_msg) from e
+
+    def save_results_to_txt(self, results: List[Dict], output_dir: str):
+        """Write each result's response text to a txt file in ``output_dir``; results lacking paths or a response are skipped."""
+        # Make sure the output directory exists
+        os.makedirs(output_dir, exist_ok=True)
+
+        for result in results:
+            if not result.get('image_paths') or 'response' not in result:
+                continue  # NOTE(review): analyze_images in this class never sets 'image_paths' - confirm callers add it
+
+            response_text = result['response']
+            image_paths = result['image_paths']
+
+            # Extract the timestamp from a file name and convert it to the standard format
+            def format_timestamp(img_path):
+                # Take the part of the file stem after the last underscore
+                timestamp = Path(img_path).stem.split('_')[-1]
+                try:
+                    # Convert the time to seconds
+                    seconds = utils.time_to_seconds(timestamp.replace('_', ':'))
+                    # Convert to HH:MM:SS,mmm format
+                    hours = int(seconds // 3600)
+                    minutes = int((seconds % 3600) // 60)
+                    seconds_remainder = seconds % 60
+                    whole_seconds = int(seconds_remainder)
+                    milliseconds = int((seconds_remainder - whole_seconds) * 1000)
+
+                    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
+                except Exception as e:
+                    logger.error(f"时间戳格式转换错误: {timestamp}, {str(e)}")
+                    return timestamp
+
+            start_timestamp = format_timestamp(image_paths[0])
+            end_timestamp = format_timestamp(image_paths[-1])
+
+            txt_path = os.path.join(output_dir, f"frame_{start_timestamp}_{end_timestamp}.txt")
+
+            # Save the result to the txt file
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(response_text.strip())
+            logger.info(f"已保存分析结果到: {txt_path}")
+
+    def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]:
+        """
+        Load several images from disk, skipping (and logging) the ones that fail.
+        Args:
+            image_paths: list of image file paths
+        Returns:
+            list of loaded PIL Image objects in RGB mode; raises ValueError if none could be loaded
+        """
+        images = []
+        failed_images = []
+
+        for img_path in image_paths:
+            try:
+                if not os.path.exists(img_path):
+                    logger.error(f"图片文件不存在: {img_path}")
+                    failed_images.append(img_path)
+                    continue
+
+                img = PIL.Image.open(img_path)
+                # Force the image data to be fully read
+                img.load()
+                # Convert to RGB mode
+                if img.mode != 'RGB':
+                    img = img.convert('RGB')
+                images.append(img)
+
+            except Exception as e:
+                logger.error(f"无法加载图片 {img_path}: {str(e)}")
+                failed_images.append(img_path)
+
+        if failed_images:
+            logger.warning(f"以下图片加载失败:\n{json.dumps(failed_images, indent=2, ensure_ascii=False)}")
+
+        if not images:
+            raise ValueError("没有成功加载任何图片")
+
+        return images
\ No newline at end of file
diff --git a/app/utils/qwenvl_analyzer.py b/app/utils/qwenvl_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d1669abb80aa01f6f51a9a0bad6580661262ddd
--- /dev/null
+++ b/app/utils/qwenvl_analyzer.py
@@ -0,0 +1,269 @@
+import json
+from typing import List, Union, Dict
+import os
+from pathlib import Path
+from loguru import logger
+from tqdm import tqdm
+import asyncio
+from tenacity import retry, stop_after_attempt, RetryError, wait_exponential
+from openai import OpenAI
+import PIL.Image
+import base64
+import io
+import traceback
+
+
+class QwenAnalyzer:
+    """千问视觉分析器类"""
+
+    def __init__(self, model_name: str = "qwen-vl-max-latest", api_key: str = None, base_url: str = None):
+        """
+        Set up the Qwen vision analyzer.
+
+        Args:
+            model_name: model name, defaults to qwen-vl-max-latest
+            api_key: Alibaba Cloud API key; a missing or empty value raises ValueError
+            base_url: API base URL, passed to the client as given (may be None)
+        """
+        # Fail fast: an API key is mandatory
+        if not api_key:
+            raise ValueError("必须提供API密钥")
+
+        # Keep the settings on the instance
+        self.model_name, self.api_key, self.base_url = model_name, api_key, base_url
+
+        # Build the API client from the stored settings
+        self._configure_client()
+
+    def _configure_client(self):
+        """
+        Create the OpenAI client and store it on ``self.client``.
+        Only the API key and base URL are passed; a construction error is logged and re-raised.
+        """
+        try:
+            client_settings = {"api_key": self.api_key, "base_url": self.base_url}
+            client = OpenAI(**client_settings)
+        except Exception as e:
+            logger.error(f"初始化OpenAI客户端失败: {str(e)}")
+            raise
+        else:
+            self.client = client
+
+    def _image_to_base64(self, image: PIL.Image.Image) -> str:
+        """Return ``image`` JPEG-encoded as a base64 string; modes the JPEG writer cannot store are converted to RGB first."""
+        if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
+            image = image.convert("RGB")  # e.g. RGBA / P / LA would otherwise make save() raise
+        buffered = io.BytesIO()
+        image.save(buffered, format="JPEG")
+        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10)
+    )
+    async def _generate_content_with_retry(self, prompt: str, batch: List[PIL.Image.Image]):
+        """Send one batch of images plus the prompt to the chat-completions API (under the retry policy above) and return the reply text."""
+        try:
+            # Build the message content
+            content = []
+
+            # Add every image as a base64 JPEG data URL
+            for img in batch:
+                base64_image = self._image_to_base64(img)
+                content.append({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}"
+                    }
+                })
+
+            # Add the text prompt; it is %-formatted with the image count three times, so it must hold exactly three placeholders
+            content.append({
+                "type": "text",
+                "text": prompt % (len(content), len(content), len(content))
+            })
+
+            # Call the API; the blocking client call runs in a worker thread
+            response = await asyncio.to_thread(
+                self.client.chat.completions.create,
+                model=self.model_name,
+                messages=[{
+                    "role": "user",
+                    "content": content
+                }]
+            )
+
+            return response.choices[0].message.content
+
+        except Exception as e:
+            logger.error(f"API调用错误: {str(e)}")
+            raise RetryError("API调用失败")
+
+    async def analyze_images(self,
+                             images: Union[List[str], List[PIL.Image.Image]],
+                             prompt: str,
+                             batch_size: int) -> List[Dict]:
+        """
+        Analyze a list of images in batches with the vision model.
+
+        Args:
+            images: Image file paths or already-loaded PIL images.
+            prompt: Prompt sent along with every batch.
+            batch_size: Number of images per request; must be positive.
+
+        Returns:
+            One dict per batch. A successful batch carries ``response``; a
+            batch that failed three times carries ``error`` instead. Both
+            kinds have ``batch_index``, ``images_processed`` and
+            ``model_used``; ``image_paths`` is present when paths are known.
+
+        Raises:
+            Exception: Wraps any failure outside the per-batch retry loop
+                (empty input, invalid batch size, image loading errors).
+        """
+        max_attempts = 3
+        try:
+            # Validate before touching images[0]; an empty list previously
+            # surfaced as an opaque IndexError.
+            if not images:
+                raise ValueError("图片列表为空")
+            if batch_size <= 0:
+                raise ValueError(f"batch_size 必须为正整数: {batch_size}")
+
+            # Keep the original paths (if paths were given) for the results.
+            original_paths = images if isinstance(images[0], str) else None
+
+            # Load images from disk when paths were given.
+            if original_paths is not None:
+                images = self.load_images(images)
+
+            if not images:
+                raise ValueError("图片列表为空")
+
+            # Keep only real PIL images.
+            # NOTE(review): load_images drops unreadable files, so index i of
+            # the loaded list may no longer match original_paths[i] - confirm.
+            valid_images = []
+            valid_paths = []
+            for i, img in enumerate(images):
+                if not isinstance(img, PIL.Image.Image):
+                    logger.error(f"无效的图片对象，索引 {i}: {type(img)}")
+                    continue
+                valid_images.append(img)
+                if original_paths:
+                    valid_paths.append(original_paths[i])
+
+            if not valid_images:
+                raise ValueError("没有有效的图片对象")
+
+            images = valid_images
+            results = []
+            # Ceiling division: a trailing partial batch still costs one request.
+            batches_needed = (len(images) + batch_size - 1) // batch_size
+
+            logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed} 次")
+
+            with tqdm(total=batches_needed, desc="分析进度") as pbar:
+                for batch_index, start in enumerate(range(0, len(images), batch_size)):
+                    batch = images[start:start + batch_size]
+                    batch_paths = valid_paths[start:start + batch_size] if valid_paths else None
+
+                    for attempt in range(1, max_attempts + 1):
+                        try:
+                            response = await self._generate_content_with_retry(prompt, batch)
+                            result_dict = {
+                                'batch_index': batch_index,
+                                'images_processed': len(batch),
+                                'response': response,
+                                'model_used': self.model_name
+                            }
+
+                            # Attach the source paths when they are known.
+                            if batch_paths:
+                                result_dict['image_paths'] = batch_paths
+
+                            results.append(result_dict)
+                            break
+
+                        except Exception as e:
+                            error_msg = f"批次 {batch_index} 处理出错: {str(e)}"
+                            logger.error(error_msg)
+
+                            if attempt == max_attempts:
+                                # Give up on this batch but keep going, and
+                                # record the failure in the results.
+                                results.append({
+                                    'batch_index': batch_index,
+                                    'images_processed': len(batch),
+                                    'error': error_msg,
+                                    'model_used': self.model_name,
+                                    'image_paths': batch_paths if batch_paths else []
+                                })
+                            else:
+                                logger.info(f"批次 {batch_index} 处理失败，等待60秒后重试当前批次...")
+                                await asyncio.sleep(60)
+
+                    pbar.update(1)
+
+            return results
+
+        except Exception as e:
+            error_msg = f"图片分析过程中发生错误: {str(e)}\n{traceback.format_exc()}"
+            logger.error(error_msg)
+            # Chain the cause so the original traceback is not lost.
+            raise Exception(error_msg) from e
+
+    def save_results_to_txt(self, results: List[Dict], output_dir: str):
+        """
+        Write each batch's response text to its own .txt file.
+
+        Args:
+            results: Batch result dicts as produced by ``analyze_images``.
+            output_dir: Directory for the files; created when missing.
+
+        Files are named ``frame_<first>_<last>.txt`` from the last
+        ``_``-separated part of the first and last image stems when
+        ``image_paths`` is present, otherwise ``batch_<batch_index>.txt``.
+        Results without a text ``response`` (failed batches only carry
+        ``error``) are skipped with a warning instead of raising KeyError.
+        """
+        # Make sure the output directory exists.
+        os.makedirs(output_dir, exist_ok=True)
+
+        for result in results:
+            response_text = result.get('response')
+            if not isinstance(response_text, str):
+                logger.warning(
+                    f"批次 {result.get('batch_index')} 没有分析结果，跳过保存: {result.get('error', '')}")
+                continue
+
+            # Prefer a file name derived from the image paths.
+            if result.get('image_paths'):
+                image_paths = result['image_paths']
+                img_name_start = Path(image_paths[0]).stem.split('_')[-1]
+                img_name_end = Path(image_paths[-1]).stem.split('_')[-1]
+                file_name = f"frame_{img_name_start}_{img_name_end}.txt"
+            else:
+                # No path information: fall back to the batch index.
+                file_name = f"batch_{result['batch_index']}.txt"
+
+            txt_path = os.path.join(output_dir, file_name)
+
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(response_text.strip())
+            logger.info(f"已保存分析结果到: {txt_path}")
+
+    def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]:
+        """
+        Load several image files as RGB PIL images.
+
+        Args:
+            image_paths: Paths of the image files to load.
+
+        Returns:
+            The successfully loaded images, in input order. Missing or
+            unreadable files are logged and left out.
+
+        Raises:
+            ValueError: If not a single image could be loaded.
+        """
+        loaded = []
+        failures = []
+
+        for path in image_paths:
+            try:
+                if os.path.exists(path):
+                    picture = PIL.Image.open(path)
+                    # Force the pixel data to be read now rather than lazily.
+                    picture.load()
+                    # Normalize every image to RGB.
+                    loaded.append(picture if picture.mode == 'RGB' else picture.convert('RGB'))
+                else:
+                    logger.error(f"图片文件不存在: {path}")
+                    failures.append(path)
+            except Exception as exc:
+                logger.error(f"无法加载图片 {path}: {str(exc)}")
+                failures.append(path)
+
+        if failures:
+            logger.warning(f"以下图片加载失败:\n{json.dumps(failures, indent=2, ensure_ascii=False)}")
+
+        if loaded:
+            return loaded
+        raise ValueError("没有成功加载任何图片")
diff --git a/app/utils/script_generator.py b/app/utils/script_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..70207828bb3d919e1b864ac41729409fa70030be
--- /dev/null
+++ b/app/utils/script_generator.py
@@ -0,0 +1,518 @@
+import os
+import json
+import traceback
+from loguru import logger
+# import tiktoken
+from typing import List, Dict
+from datetime import datetime
+from openai import OpenAI
+import google.generativeai as genai
+import time
+
+
+class BaseGenerator:
+    """
+    Provider-independent base class for narration script generators.
+
+    Subclasses implement ``_generate`` (the raw API call) and usually
+    ``_process_response`` (extracting the text). The base class builds the
+    prompt, retries failed calls and remembers the tail of the previous
+    script so consecutive segments can be linked together.
+    """
+
+    def __init__(self, model_name: str, api_key: str, prompt: str):
+        self.model_name = model_name
+        self.api_key = api_key
+        self.base_prompt = prompt
+        self.conversation_history = []
+        # Number of trailing characters of the previous script that are fed
+        # into the next prompt.
+        self.chunk_overlap = 50
+        self.last_chunk_ending = ""
+        self.default_params = {
+            "temperature": 0.7,
+            "max_tokens": 500,
+            "top_p": 0.9,
+            "frequency_penalty": 0.3,
+            "presence_penalty": 0.5
+        }
+
+    def _try_generate(self, messages: list, params: dict = None) -> str:
+        """
+        Call the provider up to three times and return the processed text.
+
+        Failures of the first two attempts are logged and retried; the
+        exception of the last attempt propagates to the caller.
+        """
+        max_attempts = 3
+
+        for attempt in range(1, max_attempts):
+            try:
+                response = self._generate(messages, params or self.default_params)
+                return self._process_response(response)
+            except Exception as e:
+                logger.warning(f"Generation attempt {attempt} failed: {str(e)}")
+
+        # Final attempt: let any error reach the caller.
+        response = self._generate(messages, params or self.default_params)
+        return self._process_response(response)
+
+    def _generate(self, messages: list, params: dict) -> any:
+        """Perform the provider-specific API call; must be overridden."""
+        raise NotImplementedError
+
+    def _process_response(self, response: any) -> str:
+        """Turn the raw provider response into text (identity by default)."""
+        return response
+
+    def generate_script(self, scene_description: str, word_count: int) -> str:
+        """
+        Generate the narration for one scene.
+
+        Args:
+            scene_description: Description of what the frames show.
+            word_count: Target length requested from the model.
+
+        Returns:
+            The generated script. Its last ``chunk_overlap`` characters are
+            stored in ``last_chunk_ending`` for the next call.
+
+        Raises:
+            Exception: Whatever the final generation attempt raised.
+        """
+        prompt = f"""{self.base_prompt}
+
+上一段文案的结尾：{self.last_chunk_ending if self.last_chunk_ending else "这是第一段，无需考虑上文"}
+
+当前画面描述：{scene_description}
+
+请确保新生成的文案与上文自然衔接，保持叙事的连贯性和趣味性。
+不要出现除了文案以外的其他任何内容；
+严格字数要求：{word_count}字，允许误差±5字。"""
+
+        messages = [
+            {"role": "system", "content": self.base_prompt},
+            {"role": "user", "content": prompt}
+        ]
+
+        try:
+            generated_script = self._try_generate(messages, self.default_params)
+
+            # Update the context; slicing already copes with short scripts.
+            if generated_script:
+                self.last_chunk_ending = generated_script[-self.chunk_overlap:]
+
+            return generated_script
+
+        except Exception as e:
+            logger.error(f"Script generation failed: {str(e)}")
+            raise
+
+
+class OpenAIGenerator(BaseGenerator):
+    """Script generator backed by the OpenAI chat completions API."""
+
+    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str):
+        super().__init__(model_name, api_key, prompt)
+        base_url = base_url or "https://api.openai.com/v1"
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+        self.max_tokens = 5000
+
+        # No tokenizer is loaded (the tiktoken import is disabled in this
+        # module), so _count_tokens falls back to a character-based estimate
+        # unless an encoder is assigned here later.
+        self.encoding = None
+
+        # OpenAI-specific parameters on top of the shared defaults.
+        self.default_params = {
+            **self.default_params,
+            "stream": False,
+            "user": "script_generator"
+        }
+
+    def _generate(self, messages: list, params: dict) -> any:
+        """Send the chat completion request; errors are logged and re-raised."""
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                **params
+            )
+            return response
+        except Exception as e:
+            logger.error(f"OpenAI generation error: {str(e)}")
+            raise
+
+    def _process_response(self, response: any) -> str:
+        """
+        Extract the stripped text of the first choice.
+
+        Raises:
+            ValueError: If the response has no choices or the first choice
+                has no text content.
+        """
+        if not response or not response.choices:
+            raise ValueError("Invalid response from OpenAI API")
+        content = response.choices[0].message.content
+        # content may be None; calling .strip() on it would raise an
+        # AttributeError instead of the documented ValueError.
+        if content is None:
+            raise ValueError("Invalid response from OpenAI API")
+        return content.strip()
+
+    def _count_tokens(self, messages: list) -> int:
+        """
+        Estimate the token count of a message list.
+
+        Adds 3 per message, 1 per ``role`` key and 3 at the end on top of
+        the encoded length of every value. When no encoder is available the
+        character count of each value is used as a rough stand-in.
+        """
+        encoding = getattr(self, "encoding", None)
+        num_tokens = 0
+        for message in messages:
+            num_tokens += 3
+            for key, value in message.items():
+                text = str(value)
+                if encoding is not None:
+                    num_tokens += len(encoding.encode(text))
+                else:
+                    num_tokens += len(text)
+                if key == "role":
+                    num_tokens += 1
+        num_tokens += 3
+        return num_tokens
+
+
+class GeminiGenerator(BaseGenerator):
+    """Script generator backed by the Google Gemini API."""
+
+    def __init__(self, model_name: str, api_key: str, prompt: str):
+        super().__init__(model_name, api_key, prompt)
+        genai.configure(api_key=api_key)
+        self.model = genai.GenerativeModel(model_name)
+
+        # Replace the shared defaults with the keys passed to Gemini as
+        # generation_config; temperature and top_p are carried over.
+        shared = self.default_params
+        self.default_params = {
+            "temperature": shared["temperature"],
+            "top_p": shared["top_p"],
+            "candidate_count": 1,
+            "stop_sequences": None
+        }
+
+    def _generate(self, messages: list, params: dict) -> any:
+        """
+        Call Gemini, retrying for as long as it is rate limited.
+
+        All message contents are joined with newlines into one prompt. A
+        first candidate without a ``content`` attribute triggers a 30 second
+        wait; an exception whose text contains "429" triggers a 65 second
+        wait. Any other exception is logged and re-raised.
+        NOTE(review): the retry loop has no upper bound - confirm intended.
+        """
+        while True:
+            try:
+                # Gemini takes a single prompt string.
+                joined = "\n".join([msg["content"] for msg in messages])
+                reply = self.model.generate_content(
+                    joined,
+                    generation_config=params
+                )
+
+                # Inspect the candidates only when the response exposes them.
+                has_candidates = (hasattr(reply, 'result') and
+                                  hasattr(reply.result, 'candidates') and
+                                  reply.result.candidates)
+                if has_candidates:
+                    first = reply.result.candidates[0]
+                    if not hasattr(first, 'content'):
+                        logger.warning("Gemini API 返回速率限制响应，等待30秒后重试...")
+                        time.sleep(30)  # wait 30 seconds, then retry
+                        continue
+                return reply
+
+            except Exception as exc:
+                message = str(exc)
+                if "429" not in message:
+                    logger.error(f"Gemini 生成文案错误: \n{message}")
+                    raise
+                logger.warning("Gemini API 触发限流，等待65秒后重试...")
+                time.sleep(65)  # wait 65 seconds, then retry
+
+    def _process_response(self, response: any) -> str:
+        """Return the stripped response text; raise ValueError when empty."""
+        if response and response.text:
+            return response.text.strip()
+        raise ValueError("Invalid response from Gemini API")
+
+
+class QwenGenerator(BaseGenerator):
+    """Script generator for Alibaba Cloud Qwen via its OpenAI-compatible API."""
+
+    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str):
+        super().__init__(model_name, api_key, prompt)
+        self.client = OpenAI(
+            api_key=api_key,
+            base_url=base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+        )
+
+        # Qwen-specific parameters on top of the shared defaults.
+        self.default_params = {
+            **self.default_params,
+            "stream": False,
+            "user": "script_generator"
+        }
+
+    def _generate(self, messages: list, params: dict) -> any:
+        """Send the chat completion request; errors are logged and re-raised."""
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                **params
+            )
+            return response
+        except Exception as e:
+            logger.error(f"Qwen generation error: {str(e)}")
+            raise
+
+    def _process_response(self, response: any) -> str:
+        """
+        Extract the stripped text of the first choice.
+
+        Raises:
+            ValueError: If the response has no choices or the first choice
+                has no text content.
+        """
+        if not response or not response.choices:
+            raise ValueError("Invalid response from Qwen API")
+        content = response.choices[0].message.content
+        # content may be None; .strip() on it would raise AttributeError.
+        if content is None:
+            raise ValueError("Invalid response from Qwen API")
+        return content.strip()
+
+
+class MoonshotGenerator(BaseGenerator):
+    """Script generator for Moonshot via its OpenAI-compatible API."""
+
+    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str):
+        super().__init__(model_name, api_key, prompt)
+        self.client = OpenAI(
+            api_key=api_key,
+            base_url=base_url or "https://api.moonshot.cn/v1"
+        )
+
+        # Moonshot-specific parameters on top of the shared defaults.
+        self.default_params = {
+            **self.default_params,
+            "stream": False,
+            "stop": None,
+            "user": "script_generator",
+            "tools": None
+        }
+
+    def _generate(self, messages: list, params: dict) -> any:
+        """
+        Send the chat completion request, retrying while rate limited.
+
+        An exception whose text contains "Error code: 429" triggers a
+        65 second wait and another attempt; any other exception is logged
+        and re-raised.
+        NOTE(review): the retry loop has no upper bound - confirm intended.
+        """
+        while True:
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model_name,
+                    messages=messages,
+                    **params
+                )
+                return response
+            except Exception as e:
+                error_str = str(e)
+                if "Error code: 429" in error_str:
+                    logger.warning("Moonshot API 触发限流，等待65秒后重试...")
+                    time.sleep(65)  # wait 65 seconds, then retry
+                    continue
+                else:
+                    logger.error(f"Moonshot generation error: {error_str}")
+                    raise
+
+    def _process_response(self, response: any) -> str:
+        """
+        Extract the stripped text of the first choice.
+
+        Raises:
+            ValueError: If the response has no choices or the first choice
+                has no text content.
+        """
+        if not response or not response.choices:
+            raise ValueError("Invalid response from Moonshot API")
+        content = response.choices[0].message.content
+        # content may be None; .strip() on it would raise AttributeError.
+        if content is None:
+            raise ValueError("Invalid response from Moonshot API")
+        return content.strip()
+
+
+class DeepSeekGenerator(BaseGenerator):
+    """Script generator for DeepSeek via its OpenAI-compatible API."""
+
+    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str):
+        super().__init__(model_name, api_key, prompt)
+        self.client = OpenAI(
+            api_key=api_key,
+            base_url=base_url or "https://api.deepseek.com"
+        )
+
+        # DeepSeek-specific parameters on top of the shared defaults.
+        self.default_params = {
+            **self.default_params,
+            "stream": False,
+            "user": "script_generator"
+        }
+
+    def _generate(self, messages: list, params: dict) -> any:
+        """Send the chat completion request; errors are logged and re-raised."""
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_name,  # deepseek-chat or deepseek-coder
+                messages=messages,
+                **params
+            )
+            return response
+        except Exception as e:
+            logger.error(f"DeepSeek generation error: {str(e)}")
+            raise
+
+    def _process_response(self, response: any) -> str:
+        """
+        Extract the stripped text of the first choice.
+
+        Raises:
+            ValueError: If the response has no choices or the first choice
+                has no text content.
+        """
+        if not response or not response.choices:
+            raise ValueError("Invalid response from DeepSeek API")
+        content = response.choices[0].message.content
+        # content may be None; .strip() on it would raise AttributeError.
+        if content is None:
+            raise ValueError("Invalid response from DeepSeek API")
+        return content.strip()
+
+
+class ScriptProcessor:
+    def __init__(self, model_name: str, api_key: str = None, base_url: str = None, prompt: str = None, video_theme: str = ""):
+        """
+        Store the configuration and pick a generator from the model name.
+
+        The lower-cased model name is searched for "gemini", "qwen",
+        "moonshot" and "deepseek" in that order; anything else uses the
+        OpenAI generator. Without an explicit prompt the default prompt for
+        ``video_theme`` is used.
+        """
+        self.model_name = model_name
+        self.api_key = api_key
+        self.base_url = base_url
+        # The theme must be set before the default prompt is built from it.
+        self.video_theme = video_theme
+        self.prompt = prompt if prompt else self._get_default_prompt()
+
+        logger.info(f"文本 LLM 提供商: {model_name}")
+        lowered = model_name.lower()
+
+        # Gemini is the only generator that takes no base_url.
+        if 'gemini' in lowered:
+            self.generator = GeminiGenerator(model_name, self.api_key, self.prompt)
+            return
+
+        openai_compatible = (
+            ('qwen', QwenGenerator),
+            ('moonshot', MoonshotGenerator),
+            ('deepseek', DeepSeekGenerator),
+        )
+        for keyword, generator_cls in openai_compatible:
+            if keyword in lowered:
+                break
+        else:
+            generator_cls = OpenAIGenerator
+        self.generator = generator_cls(model_name, self.api_key, self.prompt, self.base_url)
+
+    def _get_default_prompt(self) -> str:
+        """Return the built-in Chinese system prompt with ``self.video_theme`` filled in."""
+        return f"""
+        你是一位极具幽默感的短视频脚本创作大师，擅长用"温和的违反"制造笑点，让主题为 《{self.video_theme}》 的视频既有趣又富有传播力。
+你的任务是将视频画面描述转化为能在社交平台疯狂传播的爆款口播文案。
+
+目标受众：热爱生活、追求独特体验的18-35岁年轻人
+文案风格：基于HKRR理论 + 段子手精神
+主题：{self.video_theme}
+
+【创作核心理念】
+1. 敢于用"温和的违反"制造笑点，但不能过于冒犯
+2. 巧妙运用中国式幽默，让观众会心一笑
+3. 保持轻松愉快的叙事基调
+
+【爆款内容四要素】
+
+【快乐元素 Happy】
+1. 用调侃的语气描述画面
+2. 巧妙植入网络流行梗，增加内容的传播性
+3. 适时自嘲，展现真实且有趣的一面
+
+【知识价值 Knowledge】
+1. 用段子手的方式解释专业知识
+2. 在幽默中传递实用的生活常识
+
+【情感共鸣 Resonance】
+1. 描述"真实但夸张"的环境描述
+2. 把对自然的感悟融入俏皮话中
+3. 用接地气的表达方式拉近与观众距离
+
+【节奏控制 Rhythm】
+1. 像讲段子一样，注意铺垫和包袱的节奏
+2. 确保每段都有笑点，但不强求
+3. 段落结尾干净利落，不拖泥带水
+
+【连贯性要求】
+1. 新生成的内容必须自然衔接上一段文案的结尾
+2. 使用恰当的连接词和过渡语，确保叙事流畅
+3. 保持人物视角和语气的一致性
+4. 避免重复上一段已经提到的信息
+5. 确保情节的逻辑连续性
+
+我会按顺序提供多段视频画面描述。请创作既搞笑又能火爆全网的口播文案。
+记住：要敢于用"温和的违反"制造笑点，但要把握好尺度，让观众在轻松愉快中感受到乐趣。"""
+
+    def calculate_duration_and_word_count(self, time_range: str) -> int:
+        """
+        Estimate a suitable narration length for a time range.
+
+        Args:
+            time_range: "start-end", each side "HH:MM:SS,mmm". The
+                millisecond part is optional and "MM:SS" / "SS" are accepted
+                too, matching what ``_save_results`` parses.
+                Example: "00:00:50,100-00:01:21,500"
+
+        Returns:
+            int: Estimated word count at one character per 0.4 seconds
+                (10 seconds -> 25 characters), clamped to 10..500. An
+                endpoint that cannot be parsed counts as 0 seconds; if the
+                range itself cannot be split, 100 is returned.
+        """
+        seconds_per_word = 0.4
+
+        def time_to_seconds(time_str: str) -> float:
+            """Convert one timestamp to seconds; 0.0 when it cannot be parsed."""
+            try:
+                time_part, _, ms_part = time_str.partition(',')
+                fields = [int(field) for field in time_part.split(':')]
+                if len(fields) > 3:
+                    raise ValueError("too many ':' separated fields")
+                # Left-pad so "SS" and "MM:SS" read as 00:00:SS / 00:MM:SS.
+                hours, minutes, seconds = [0] * (3 - len(fields)) + fields
+                milliseconds = int(ms_part) if ms_part else 0
+                return (hours * 3600) + (minutes * 60) + seconds + (milliseconds / 1000)
+            except ValueError as e:
+                logger.warning(f"时间格式解析错误: {time_str}, error: {e}")
+                return 0.0
+
+        try:
+            start_str, end_str = time_range.split('-')
+
+            start_seconds = time_to_seconds(start_str)
+            end_seconds = time_to_seconds(end_str)
+            duration = end_seconds - start_seconds
+
+            word_count = int(duration / seconds_per_word)
+
+            # Keep the result within 10-500 characters.
+            word_count = max(10, min(word_count, 500))
+
+            logger.debug(f"时间范围 {time_range} 的持续时间为 {duration:.3f}秒, 估算字数: {word_count}")
+            return word_count
+
+        except Exception:
+            logger.warning(f"字数计算错误: {traceback.format_exc()}")
+            return 100  # default word count when the range is unusable
+
+    def process_frames(self, frame_content_list: List[Dict]) -> List[Dict]:
+        """
+        Generate a narration for every frame dict, in place.
+
+        Each dict must have "timestamp" and "picture"; "narration" and
+        "OST" (always 2) are added. The list is then saved through
+        ``_save_results`` and returned.
+        """
+        for frame in frame_content_list:
+            target_words = self.calculate_duration_and_word_count(frame["timestamp"])
+            narration = self.generator.generate_script(frame["picture"], target_words)
+            frame.update(narration=narration, OST=2)
+            logger.info(f"时间范围: {frame['timestamp']}, 建议字数: {target_words}")
+            logger.info(narration)
+
+        self._save_results(frame_content_list)
+        return frame_content_list
+
+    def _save_results(self, frame_content_list: List[Dict]):
+        """
+        Add back-to-back ``new_timestamp`` values and dump the list as JSON.
+
+        Each frame keeps its own duration, but the new timestamps start at
+        zero and follow each other without gaps. The result is written to
+        ``storage/json/step2_<YYYYmmdd_HHMMSS>.json`` (relative to the
+        current working directory).
+
+        Raises:
+            Exception: Any error is logged with its traceback and re-raised.
+        """
+        def format_timestamp(seconds: float) -> str:
+            """Convert seconds to HH:MM:SS,mmm."""
+            # Round to whole milliseconds first: truncating the fractional
+            # part turned values such as 1.001 into ",000".
+            total_ms = int(round(seconds * 1000))
+            whole_seconds, milliseconds = divmod(total_ms, 1000)
+            minutes, whole_seconds = divmod(whole_seconds, 60)
+            hours, minutes = divmod(minutes, 60)
+            return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
+
+        def time_to_seconds(time_str: str) -> float:
+            """Convert HH:MM:SS / MM:SS / SS with optional ",mmm" to seconds."""
+            try:
+                if ',' in time_str:
+                    time_part, ms_part = time_str.split(',')
+                    ms = float(ms_part) / 1000
+                else:
+                    time_part = time_str
+                    ms = 0
+
+                parts = time_part.split(':')
+                if len(parts) == 3:  # HH:MM:SS
+                    h, m, s = map(float, parts)
+                    seconds = h * 3600 + m * 60 + s
+                elif len(parts) == 2:  # MM:SS
+                    m, s = map(float, parts)
+                    seconds = m * 60 + s
+                else:  # SS
+                    seconds = float(parts[0])
+
+                return seconds + ms
+            except Exception as e:
+                logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+                return 0.0
+
+        try:
+            # Running position on the new timeline, in seconds.
+            current_time = 0.0
+
+            for frame in frame_content_list:
+                # Duration of the segment on the original timeline.
+                start_str, end_str = frame['timestamp'].split('-')
+                start_seconds = time_to_seconds(start_str)
+                end_seconds = time_to_seconds(end_str)
+                duration = end_seconds - start_seconds
+
+                new_start = format_timestamp(current_time)
+                new_end = format_timestamp(current_time + duration)
+                frame['new_timestamp'] = f"{new_start}-{new_end}"
+
+                current_time += duration
+
+            file_name = f"storage/json/step2_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+            os.makedirs(os.path.dirname(file_name), exist_ok=True)
+
+            with open(file_name, 'w', encoding='utf-8') as file:
+                json.dump(frame_content_list, file, ensure_ascii=False, indent=4)
+
+            logger.info(f"保存脚本成功，总时长: {format_timestamp(current_time)}")
+
+        except Exception as e:
+            logger.error(f"保存结果时发生错误: {str(e)}\n{traceback.format_exc()}")
+            raise
diff --git a/app/utils/utils.py b/app/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dbf7e3bf85dbfb394e2a81400facf1caa8dc972
--- /dev/null
+++ b/app/utils/utils.py
@@ -0,0 +1,668 @@
+import locale
+import os
+import traceback
+
+import requests
+import threading
+from typing import Any
+from loguru import logger
+import streamlit as st
+import json
+from uuid import uuid4
+import urllib3
+from datetime import datetime, timedelta
+
+from app.models import const
+from app.utils import check_script
+from app.services import material
+
+urllib3.disable_warnings()
+
+
+def get_response(status: int, data: Any = None, message: str = ""):
+    """
+    Build a response dict with "status" plus optional "data" / "message".
+
+    "data" and "message" are only included when they are truthy.
+    """
+    optional_fields = {"data": data, "message": message}
+    response = {"status": status}
+    response.update({key: value for key, value in optional_fields.items() if value})
+    return response
+
+
+def to_json(obj):
+    """
+    Serialize an object to an indented JSON string, or None on failure.
+
+    Primitives pass through, bytes become the "*** binary data ***"
+    placeholder, dicts / lists / tuples are converted recursively, objects
+    with a ``__dict__`` are serialized through it and everything else
+    becomes null.
+    """
+    def _plain(value):
+        # Already JSON-compatible.
+        if value is None or isinstance(value, (int, float, bool, str)):
+            return value
+        # Binary payloads are replaced by a placeholder, not encoded.
+        if isinstance(value, bytes):
+            return "*** binary data ***"
+        if isinstance(value, dict):
+            return {key: _plain(item) for key, item in value.items()}
+        if isinstance(value, (list, tuple)):
+            return [_plain(item) for item in value]
+        # Custom objects: fall back to their attribute dict.
+        if hasattr(value, "__dict__"):
+            return _plain(value.__dict__)
+        return None
+
+    try:
+        return json.dumps(_plain(obj), ensure_ascii=False, indent=4)
+    except Exception:
+        return None
+
+
+def get_uuid(remove_hyphen: bool = False):
+    """Return a random UUID4 string, optionally without the hyphens."""
+    value = str(uuid4())
+    return value.replace("-", "") if remove_hyphen else value
+
+
+def root_dir():
+    """Return the directory three levels above this file's real path."""
+    path = os.path.realpath(__file__)
+    for _ in range(3):
+        path = os.path.dirname(path)
+    return path
+
+
+def storage_dir(sub_dir: str = "", create: bool = False):
+    """
+    Return the path of ``<root>/storage`` or one of its sub-directories.
+
+    Args:
+        sub_dir: Optional sub-directory below the storage directory.
+        create: Create the directory when True.
+    """
+    d = os.path.join(root_dir(), "storage")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    if create:
+        # exist_ok avoids the race between an exists() check and makedirs().
+        os.makedirs(d, exist_ok=True)
+
+    return d
+
+
+def resource_dir(sub_dir: str = ""):
+    """Return the path of ``<root>/resource`` or one of its sub-directories."""
+    base = os.path.join(root_dir(), "resource")
+    return os.path.join(base, sub_dir) if sub_dir else base
+
+
+def task_dir(sub_dir: str = ""):
+    """Return ``<storage>/tasks`` (or a sub-directory of it), creating it if needed."""
+    d = os.path.join(storage_dir(), "tasks")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def font_dir(sub_dir: str = ""):
+    """Return ``<resource>/fonts`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("fonts")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def song_dir(sub_dir: str = ""):
+    """Return ``<resource>/songs`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("songs")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
+    """
+    Resolve the background music file to use.
+
+    Args:
+        bgm_type: "random" picks a file from the song directory; an empty
+            value means no background music.
+        bgm_file: Explicit file path; wins over ``bgm_type`` when it exists.
+
+    Returns:
+        str: Path of the music file, or "" when none applies.
+    """
+    import glob
+    import random
+
+    if not bgm_type:
+        return ""
+    if bgm_file and os.path.exists(bgm_file):
+        return bgm_file
+    if bgm_type != "random":
+        return ""
+
+    music_dir = song_dir()
+    if not os.path.exists(music_dir):
+        logger.warning(f"背景音乐目录不存在: {music_dir}")
+        return ""
+
+    # MP3 files first, then FLAC files.
+    candidates = []
+    for pattern in ("*.mp3", "*.flac"):
+        candidates.extend(glob.glob(os.path.join(music_dir, pattern)))
+
+    if candidates:
+        return random.choice(candidates)
+
+    logger.warning(f"在目录 {music_dir} 中没有找到 MP3 或 FLAC 文件")
+    return ""
+
+
+def public_dir(sub_dir: str = ""):
+    """Return ``<resource>/public`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("public")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def srt_dir(sub_dir: str = ""):
+    """Return ``<resource>/srt`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("srt")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def run_in_background(func, *args, **kwargs):
+    """
+    Run ``func(*args, **kwargs)`` on a new thread and return that thread.
+
+    Exceptions raised by ``func`` are logged, not propagated; its return
+    value is discarded.
+    """
+    def _guarded():
+        try:
+            func(*args, **kwargs)
+        except Exception as exc:
+            logger.error(f"run_in_background error: {exc}")
+
+    worker = threading.Thread(target=_guarded)
+    worker.start()
+    return worker
+
+
+def time_convert_seconds_to_hmsm(seconds) -> str:
+    """
+    Convert a number of seconds to an "HH:MM:SS,mmm" timestamp.
+
+    The value is rounded to whole milliseconds first; truncating
+    ``seconds * 1000`` lost a millisecond to float error (1.001 -> ",000").
+    """
+    total_ms = int(round(seconds * 1000))
+    whole_seconds, milliseconds = divmod(total_ms, 1000)
+    minutes, secs = divmod(whole_seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, milliseconds)
+
+
+def format_time(seconds: float) -> str:
+    """
+    Convert a number of seconds to an "HH:MM:SS,mmm" timestamp.
+
+    Args:
+        seconds: Seconds to convert, as an integer or float.
+
+    Returns:
+        The formatted timestamp. The value is rounded to whole milliseconds
+        first; truncating the fractional part lost a millisecond to float
+        error (1.001 -> ",000").
+    """
+    total_ms = int(round(seconds * 1000))
+    whole_seconds, milliseconds = divmod(total_ms, 1000)
+    minutes, secs = divmod(whole_seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+
+    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, milliseconds)
+
+
+def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
+    """
+    Format one subtitle entry: index line, "start --> end" line, text line.
+
+    The entry ends with a newline followed by eight spaces, exactly as the
+    template has always produced it.
+    """
+    begin = time_convert_seconds_to_hmsm(start_time)
+    finish = time_convert_seconds_to_hmsm(end_time)
+    template = "%d\n%s --> %s\n%s\n        "
+    return template % (idx, begin, finish, msg)
+
+
+def str_contains_punctuation(word):
+    """Return True when ``word`` contains any entry of ``const.PUNCTUATIONS``."""
+    return any(mark in word for mark in const.PUNCTUATIONS)
+
+
+def split_string_by_punctuations(s):
+    """
+    Split ``s`` at newlines and at the characters in ``const.PUNCTUATIONS``.
+
+    A "." between two digits (as in "2.5") is kept as part of the text.
+    Pieces are stripped and empty pieces are dropped.
+    """
+    pieces = []
+    buffer = []
+    last_index = len(s) - 1
+
+    def _flush():
+        pieces.append("".join(buffer).strip())
+        buffer.clear()
+
+    for index, char in enumerate(s):
+        if char == "\n":
+            _flush()
+            continue
+
+        before = s[index - 1] if index > 0 else ""
+        after = s[index + 1] if index < last_index else ""
+
+        # Decimal point inside a number such as "2.5": not a separator.
+        is_decimal_point = char == "." and before.isdigit() and after.isdigit()
+        if is_decimal_point or char not in const.PUNCTUATIONS:
+            buffer.append(char)
+        else:
+            _flush()
+
+    _flush()
+    # Drop empty pieces.
+    return [piece for piece in pieces if piece]
+
+
+def md5(text):
+    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
+    import hashlib
+
+    digest = hashlib.md5()
+    digest.update(text.encode("utf-8"))
+    return digest.hexdigest()
+
+
+def get_system_locale():
+    """
+    Return the language part of the default locale, or "en" on any failure.
+
+    For example zh_CN and zh_TW give "zh"; en_US and en_GB give "en".
+    """
+    try:
+        language, _encoding = locale.getdefaultlocale()
+        return language.split("_")[0]
+    except Exception:
+        return "en"
+
+
+def load_locales(i18n_dir):
+    """
+    Load every ``*.json`` file below ``i18n_dir`` into a dict.
+
+    The key is the file name up to its first "."; the value is the parsed
+    JSON content.
+    """
+    translations = {}
+    for folder, _subdirs, names in os.walk(i18n_dir):
+        for name in names:
+            if not name.endswith(".json"):
+                continue
+            language = name.split(".")[0]
+            with open(os.path.join(folder, name), "r", encoding="utf-8") as handle:
+                translations[language] = json.load(handle)
+    return translations
+
+
+def parse_extension(filename):
+    """Return the lower-cased file extension of ``filename`` without dots."""
+    _stem, extension = os.path.splitext(filename)
+    cleaned = extension.strip().lower()
+    return cleaned.replace(".", "")
+
+
+def script_dir(sub_dir: str = ""):
+    """Return ``<resource>/scripts`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("scripts")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def video_dir(sub_dir: str = ""):
+    """Return ``<resource>/videos`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("videos")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def subtitle_dir(sub_dir: str = ""):
+    """Return ``<resource>/srt`` (or a sub-directory of it), creating it if needed."""
+    d = resource_dir("srt")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the race between an exists() check and makedirs().
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def split_timestamp(timestamp):
+    """
+    Split "A:B-C:D" (integer fields) into the pair ("00:AA:BB", "00:CC:DD"), each field zero-padded to two digits.
+    """
+    start, end = timestamp.split('-')
+    start_hour, start_minute = map(int, start.split(':'))  # NOTE: despite the names, these land in the 2nd and 3rd output fields
+    end_hour, end_minute = map(int, end.split(':'))
+
+    start_time = '00:{:02d}:{:02d}'.format(start_hour, start_minute)  # the first field is always "00"
+    end_time = '00:{:02d}:{:02d}'.format(end_hour, end_minute)
+
+    return start_time, end_time
+
+
+def reduce_video_time(txt: str, duration: float = 0.21531) -> int:
+    """
+    Estimate a duration from the character count of `txt`, at `duration` seconds per character (default 0.21531).
+    Returns: len(txt) * duration as an int.
+    """
+    # int() truncates toward zero; the result is NOT rounded to the nearest integer
+    duration = len(txt) * duration
+    return int(duration)
+
+
+def get_current_country():
+    """
+    Look up the country of the current public IP address; returns its name, or None if unknown or on error.
+    """
+    try:
+        # Free ipapi.co endpoint; the timeout keeps a stalled connection from blocking the caller forever
+        response = requests.get('https://ipapi.co/json/', timeout=10)
+        data = response.json()
+
+        # Country name as reported by the service
+        country = data.get('country_name')
+
+        if country:
+            logger.debug(f"当前网络IP地址位于：{country}")
+            return country
+        else:
+            logger.debug("无法确定当前网络IP地址所在的国家")
+            return None
+
+    except (requests.RequestException, ValueError):  # ValueError: the body was not valid JSON
+        logger.error("获取IP地址信息时发生错误，请检查网络连接")
+        return None
+
+
+def time_to_seconds(time_str: str) -> float:
+    """
+    Convert a time string to seconds. Supported formats:
+    - "HH:MM:SS,mmm" -> hours:minutes:seconds,milliseconds
+    - "MM:SS,mmm" -> minutes:seconds,milliseconds
+    - "SS,mmm" -> seconds,milliseconds
+    - "SS-mmm" -> seconds-milliseconds
+
+    Args:
+        time_str: the time string to convert
+
+    Returns:
+        float: seconds including the millisecond part; 0.0 (after logging an error) if parsing fails
+    """
+    try:
+        # '-' separates the time part from the milliseconds (checked before ',')
+        if '-' in time_str:
+            time_part, ms_part = time_str.split('-')
+            ms = float(ms_part) / 1000
+        # ',' separates the time part from the milliseconds
+        elif ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # Break the time part into its ':'-separated fields
+        parts = time_part.split(':')
+
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(float, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(float, parts)
+            seconds = m * 60 + s
+        else:  # SS (any other field count: only the first field is used)
+            seconds = float(parts[0])
+
+        return seconds + ms
+
+    except (ValueError, IndexError) as e:
+        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+        return 0.0
+
+
+def seconds_to_time(seconds: float) -> str:  # -> "HH:MM:SS.mmm" (a dot, not a comma, before the milliseconds)
+    h, remainder = divmod(seconds, 3600)
+    m, s = divmod(remainder, 60)
+    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}"  # 06.3f: seconds zero-padded to width 6 with 3 decimals
+
+
+def calculate_total_duration(scenes):
+    """
+    Sum the durations of all scenes.
+
+    Args:
+        scenes: list of scenes, each with a 'timestamp' field such as "00:00:28,350-00:00:41,000"
+
+    Returns:
+        float: total duration in seconds
+    """
+    total_seconds = 0
+
+    for scene in scenes:
+        start, end = scene['timestamp'].split('-')
+        # time_to_seconds understands the millisecond-precision formats
+        start_seconds = time_to_seconds(start)
+        end_seconds = time_to_seconds(end)
+
+        duration = end_seconds - start_seconds
+        total_seconds += duration
+
+    return total_seconds
+
+
+def add_new_timestamps(scenes):
+    """
+    Add a 'new_timestamp' ("MM:SS-MM:SS" on the output timeline) to every scene and tag scenes without narration.
+    Args:
+        scenes: list of scene dicts, each with a 'timestamp' of the form "start-end"
+
+    Returns:
+        A new list of shallow-copied scenes; the input dicts are not modified
+    """
+    current_time = timedelta()
+    updated_scenes = []
+
+    # Check the script before deriving anything from it
+    check_script.check_script(scenes, calculate_total_duration(scenes))
+
+    for scene in scenes:
+        new_scene = scene.copy()  # shallow copy so the caller's dict keeps its original data
+        start, end = new_scene['timestamp'].split('-')
+        # Parse with time_to_seconds (as calculate_total_duration does) so "HH:MM:SS,mmm"
+        # timestamps and positions past 59:59 work; strptime('%M:%S') rejected both.
+        duration = timedelta(seconds=time_to_seconds(end) - time_to_seconds(start))
+
+        new_start = current_time
+        current_time += duration
+        new_end = current_time
+
+        # Render the running totals as zero-padded minutes:seconds (fractions are dropped)
+        new_start_str = f"{int(new_start.total_seconds() // 60):02d}:{int(new_start.total_seconds() % 60):02d}"
+        new_end_str = f"{int(new_end.total_seconds() // 60):02d}:{int(new_end.total_seconds() % 60):02d}"
+
+        new_scene['new_timestamp'] = f"{new_start_str}-{new_end_str}"
+
+        # Scenes with empty or missing narration get a unique placeholder narration
+        if new_scene.get('narration') in ("", None):
+            unique_id = str(uuid4())[:8]  # first 8 characters of a UUID4
+            new_scene['narration'] = f"原声播放_{unique_id}"
+
+        updated_scenes.append(new_scene)
+
+    return updated_scenes
+
+
+def clean_model_output(output):
+    """Remove one surrounding Markdown code fence (``` or ```json) and outer whitespace from model output."""
+    output = output.strip()
+    output = output[7:] if output.startswith('```json') else output[3:] if output.startswith('```') else output
+    output = output[:-3] if output.endswith('```') else output  # str.strip('```json') would also eat j/s/o/n letters
+    return output.strip()
+
+
+def cut_video(params, progress_callback=None):  # -> (task_id, subclip_videos); errors are logged and re-raised
+    try:
+        task_id = str(uuid4())
+        st.session_state['task_id'] = task_id  # publish the new task id in the Streamlit session
+
+        if not st.session_state.get('video_clip_json'):
+            raise ValueError("视频脚本不能为空")
+
+        video_script_list = st.session_state['video_clip_json']
+        time_list = [i['timestamp'] for i in video_script_list]
+
+        def clip_progress(current, total):  # forwards progress to the callback as an integer percentage
+            progress = int((current / total) * 100)
+            if progress_callback:
+                progress_callback(progress)
+
+        subclip_videos = material.clip_videos(
+            task_id=task_id,
+            timestamp_terms=time_list,
+            origin_video=params.video_origin_path,
+            progress_callback=clip_progress
+        )
+
+        if subclip_videos is None:
+            raise ValueError("裁剪视频失败")
+
+        st.session_state['subclip_videos'] = subclip_videos
+        for i, video_script in enumerate(video_script_list):
+            try:
+                video_script['path'] = subclip_videos[i+1]  # looked up by 1-based position; presumably a dict - confirm against material.clip_videos
+            except KeyError as err:
+                logger.error(f"裁剪视频失败: {err}")  # a missing clip is only logged; the loop continues
+
+        return task_id, subclip_videos
+
+    except Exception as e:
+        logger.error(f"视频裁剪过程中发生错误: \n{traceback.format_exc()}")
+        raise
+
+
+def temp_dir(sub_dir: str = ""):
+    """
+    Return the temporary-files directory, creating it if needed.
+    Args:
+        sub_dir: optional sub-directory name
+    Returns:
+        str: path of the "temp" directory under storage_dir(), or of `sub_dir` inside it
+    """
+    d = os.path.join(storage_dir(), "temp")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the exists()/makedirs() race when two sessions create the directory at once
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def clear_keyframes_cache(video_path: str = None):
+    """
+    Remove cached key frames; failures are logged, not raised.
+    Args:
+        video_path: path of a video file; if given, only that video's cache is removed
+    """
+    try:
+        keyframes_dir = os.path.join(temp_dir(), "keyframes")
+        if not os.path.exists(keyframes_dir):
+            return
+
+        if video_path:
+            # Remove only this video's cache; its folder name is md5(path + modification time)
+            video_hash = md5(video_path + str(os.path.getmtime(video_path)))
+            video_keyframes_dir = os.path.join(keyframes_dir, video_hash)
+            if os.path.exists(video_keyframes_dir):
+                import shutil
+                shutil.rmtree(video_keyframes_dir)
+                logger.info(f"已清理视频关键帧缓存: {video_path}")
+        else:
+            # Remove the whole key-frame cache
+            import shutil
+            shutil.rmtree(keyframes_dir)
+            logger.info("已清理所有关键帧缓存")
+
+    except Exception as e:
+        logger.error(f"清理关键帧缓存失败: {e}")
+
+
+def init_resources():
+    """Prepare the font directory: copy the first available Windows font, else download Source Han Sans. Errors are only logged."""
+    try:
+        # Create the font directory
+        font_dir = os.path.join(root_dir(), "resource", "fonts")
+        os.makedirs(font_dir, exist_ok=True)
+
+        # Candidate fonts as (target file name, source); a source is either a URL or a local path
+        font_files = [
+            ("SourceHanSansCN-Regular.otf",
+             "https://github.com/adobe-fonts/source-han-sans/raw/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf"),
+            ("simhei.ttf", "C:/Windows/Fonts/simhei.ttf"),  # Windows SimHei
+            ("simkai.ttf", "C:/Windows/Fonts/simkai.ttf"),  # Windows KaiTi
+            ("simsun.ttc", "C:/Windows/Fonts/simsun.ttc"),  # Windows SimSun
+        ]
+
+        # Prefer a system font: stop at the first local source that exists
+        system_font_found = False
+        for font_name, source in font_files:
+            if not source.startswith("http") and os.path.exists(source):
+                target_path = os.path.join(font_dir, font_name)
+                if not os.path.exists(target_path):
+                    import shutil
+                    shutil.copy2(source, target_path)
+                    logger.info(f"已复制系统字体: {font_name}")
+                system_font_found = True
+                break
+
+        # No system font found: download Source Han Sans unless it is already present
+        if not system_font_found:
+            source_han_path = os.path.join(font_dir, "SourceHanSansCN-Regular.otf")
+            if not os.path.exists(source_han_path):
+                download_font(font_files[0][1], source_han_path)
+
+    except Exception as e:
+        logger.error(f"初始化资源文件失败: {e}")
+
+
+def download_font(url: str, font_path: str):
+    """Download `url` and write it to `font_path`; any failure is logged and re-raised."""
+    try:
+        logger.info(f"正在下载字体文件: {url}")
+        import requests
+        response = requests.get(url, timeout=30)  # without a timeout a stalled server blocks forever
+        response.raise_for_status()
+
+        with open(font_path, 'wb') as f:
+            f.write(response.content)
+
+        logger.info(f"字体文件下载成功: {font_path}")
+
+    except Exception as e:
+        logger.error(f"下载字体文件失败: {e}")
+        raise
+
+
+def init_imagemagick():
+    """Check that the `magick` command runs and point IMAGEMAGICK_BINARY at it; returns True on success, False otherwise."""
+    try:
+        # Check whether ImageMagick is installed by running `magick -version`
+        import subprocess
+        result = subprocess.run(['magick', '-version'], capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error("ImageMagick 未安装或配置不正确")
+            return False
+
+        # Set the IMAGEMAGICK_BINARY environment variable
+        os.environ['IMAGEMAGICK_BINARY'] = 'magick'
+
+        return True
+    except Exception as e:
+        logger.error(f"初始化 ImageMagick 失败: {str(e)}")
+        return False
diff --git a/app/utils/video_processor.py b/app/utils/video_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bc3ab8e2c3b9bcc91fd7005f393077810cd12e4
--- /dev/null
+++ b/app/utils/video_processor.py
@@ -0,0 +1,225 @@
+"""
+视频帧提取工具
+
+这个模块提供了简单高效的视频帧提取功能。主要特点：
+1. 使用ffmpeg进行视频处理，支持硬件加速
+2. 按指定时间间隔提取视频关键帧
+3. 支持多种视频格式
+4. 支持高清视频帧输出
+5. 直接从原视频提取高质量关键帧
+
+不依赖OpenCV和sklearn等库，只使用ffmpeg作为外部依赖，降低了安装和使用的复杂度。
+"""
+
+import os
+import re
+import time
+import subprocess
+from typing import List, Dict
+from loguru import logger
+from tqdm import tqdm
+
+from app.utils import ffmpeg_utils
+
+
+class VideoProcessor:
+    def __init__(self, video_path: str):
+        """
+        Probe the video with ffprobe and cache its basic properties.
+
+        Args:
+            video_path: path of the video file; FileNotFoundError is raised if it does not exist
+        """
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"视频文件不存在: {video_path}")
+
+        self.video_path = video_path
+        self.video_info = self._get_video_info()
+        self.fps = float(self.video_info.get('fps', 25))
+        self.duration = float(self.video_info.get('duration', 0))
+        self.width = int(self.video_info.get('width', 0))
+        self.height = int(self.video_info.get('height', 0))
+        self.total_frames = int(self.fps * self.duration)
+
+    def _get_video_info(self) -> Dict[str, str]:
+        """
+        Ask ffprobe for the first video stream's width, height, frame rate and duration.
+
+        Returns:
+            Dict[str, str]: the key=value pairs printed by ffprobe plus a derived 'fps'; fixed defaults if ffprobe fails
+        """
+        cmd = [
+            "ffprobe",
+            "-v", "error",
+            "-select_streams", "v:0",
+            "-show_entries", "stream=width,height,r_frame_rate,duration",
+            "-of", "default=noprint_wrappers=1:nokey=0",
+            self.video_path
+        ]
+
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            lines = result.stdout.strip().split('\n')
+            info = {}
+            for line in lines:
+                if '=' in line:
+                    key, value = line.split('=', 1)
+                    info[key] = value
+
+            # The frame rate may be a fraction such as "30000/1001"
+            if 'r_frame_rate' in info:
+                try:
+                    num, den = map(int, info['r_frame_rate'].split('/'))
+                    info['fps'] = str(num / den)
+                except (ValueError, ZeroDivisionError):  # not "a/b", or a zero denominator such as "0/0"
+                    info['fps'] = info['r_frame_rate'] if '/' not in info['r_frame_rate'] else '25'
+
+            return info
+
+        except subprocess.CalledProcessError as e:
+            logger.error(f"获取视频信息失败: {e.stderr}")
+            return {
+                'width': '1280',
+                'height': '720',
+                'fps': '25',
+                'duration': '0'
+            }
+
+    def extract_frames_by_interval(self, output_dir: str, interval_seconds: float = 5.0,
+                                  use_hw_accel: bool = True) -> List[int]:
+        """
+        Extract one frame every `interval_seconds`, starting at 0 and stopping before the video duration.
+
+        Args:
+            output_dir: directory the JPEG frames are written to (created if missing)
+            interval_seconds: seconds between extracted frames; must be > 0, otherwise ValueError is raised
+            use_hw_accel: whether to pass hardware-acceleration arguments to ffmpeg when available
+
+        Returns:
+            List[int]: frame numbers of the frames ffmpeg extracted successfully
+        """
+        if interval_seconds <= 0:
+            raise ValueError(f"帧提取间隔必须大于 0: {interval_seconds}")  # a non-positive step would never end the loop below
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Compute the extraction time points
+        end_time = self.duration
+        extraction_times = []
+
+        current_time = 0
+        while current_time < end_time:
+            extraction_times.append(current_time)
+            current_time += interval_seconds
+
+        if not extraction_times:
+            logger.warning("未找到需要提取的帧")
+            return []
+
+        # Choose the hardware-acceleration arguments
+        hw_accel = []
+        if use_hw_accel and ffmpeg_utils.is_ffmpeg_hwaccel_available():
+            hw_accel = ffmpeg_utils.get_ffmpeg_hwaccel_args()
+
+        # Extract the frames, one ffmpeg invocation per time point
+        frame_numbers = []
+        for timestamp in tqdm(extraction_times, desc="提取视频帧"):
+            frame_number = int(timestamp * self.fps)
+
+            # Timestamp formatted as HHMMSSmmm for the file name
+            hours = int(timestamp // 3600)
+            minutes = int((timestamp % 3600) // 60)
+            seconds = int(timestamp % 60)
+            milliseconds = int((timestamp % 1) * 1000)
+            time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
+
+            output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg")
+
+            # Build the ffmpeg command that grabs a single frame
+            cmd = [
+                "ffmpeg",
+                "-hide_banner",
+                "-loglevel", "error",
+            ]
+
+            # Hardware-acceleration arguments go before the input
+            cmd.extend(hw_accel)
+
+            cmd.extend([
+                "-ss", str(timestamp),
+                "-i", self.video_path,
+                "-vframes", "1",
+                "-q:v", "1",  # best JPEG quality
+                "-y",
+                output_path
+            ])
+
+            try:
+                subprocess.run(cmd, check=True, capture_output=True)
+                frame_numbers.append(frame_number)  # count only frames whose ffmpeg call succeeded
+            except subprocess.CalledProcessError as e:
+                logger.warning(f"提取帧 {frame_number} 失败: {e.stderr.decode(errors='replace')}")
+
+        logger.info(f"成功提取了 {len(frame_numbers)} 个视频帧")
+        return frame_numbers
+
+    def _detect_hw_accelerator(self) -> List[str]:
+        """
+        Return the ffmpeg hardware-acceleration arguments for this system.
+
+        Returns:
+            List[str]: ffmpeg command-line arguments; empty when no hardware acceleration is available
+        """
+        # Delegate to the shared detection in ffmpeg_utils
+        if ffmpeg_utils.is_ffmpeg_hwaccel_available():
+            return ffmpeg_utils.get_ffmpeg_hwaccel_args()
+        return []
+
+    def process_video_pipeline(self,
+                              output_dir: str,
+                              interval_seconds: float = 5.0,  # seconds between extracted frames
+                              use_hw_accel: bool = True) -> None:
+        """
+        Run the simplified pipeline: extract frames from the source video at a fixed interval.
+
+        Args:
+            output_dir: output directory
+            interval_seconds: seconds between extracted frames
+            use_hw_accel: whether to use hardware acceleration
+        """
+        # Create the output directory
+        os.makedirs(output_dir, exist_ok=True)
+
+        try:
+            # Extract the frames straight from the source video
+            logger.info(f"从视频间隔 {interval_seconds} 秒提取关键帧...")
+            self.extract_frames_by_interval(
+                output_dir,
+                interval_seconds=interval_seconds,
+                use_hw_accel=use_hw_accel
+            )
+
+            logger.info(f"处理完成！视频帧已保存在: {output_dir}")
+
+        except Exception as e:
+            import traceback
+            logger.error(f"视频处理失败: \n{traceback.format_exc()}")
+            raise
+
+
+if __name__ == "__main__":
+    import time
+
+    start_time = time.time()
+
+    # Usage example
+    processor = VideoProcessor("./resource/videos/test.mp4")
+
+    # Extract one frame every 3 seconds
+    processor.process_video_pipeline(
+        output_dir="output",
+        interval_seconds=3.0,
+        use_hw_accel=True
+    )
+
+    end_time = time.time()
+    print(f"处理完成！总耗时: {end_time - start_time:.2f} 秒")
diff --git a/changelog.py b/changelog.py
new file mode 100644
index 0000000000000000000000000000000000000000..31a1337a488dc04c44c6ac2da3153a568ef227bb
--- /dev/null
+++ b/changelog.py
@@ -0,0 +1,17 @@
+from git_changelog.cli import build_and_render
+
+# Run this script to generate the CHANGELOG.md file automatically
+
+build_and_render(
+    repository=".",
+    output="CHANGELOG.md",
+    convention="angular",
+    provider="github",
+    template="keepachangelog",
+    parse_trailers=True,
+    parse_refs=False,
+    sections=["build", "deps", "feat", "fix", "refactor"],
+    versioning="pep440",
+    bump="1.1.2",  # the version to bump to
+    in_place=True,
+)
diff --git a/config.example.toml b/config.example.toml
new file mode 100644
index 0000000000000000000000000000000000000000..adafb84f72e553294b90bd30bf6506de168dba89
--- /dev/null
+++ b/config.example.toml
@@ -0,0 +1,89 @@
+[app]
+    project_version="0.6.2"
+    # 支持视频理解的大模型提供商
+    #   gemini  (谷歌, 需要 VPN)
+    #   siliconflow (硅基流动)
+    #   qwenvl  (通义千问)
+    vision_llm_provider="Siliconflow"
+
+    ########## Gemini 视觉模型
+    vision_gemini_api_key = ""
+    vision_gemini_model_name = "gemini-2.0-flash-lite"
+
+    ########## QwenVL 视觉模型
+    vision_qwenvl_api_key = ""
+    vision_qwenvl_model_name = "qwen2.5-vl-32b-instruct"
+    vision_qwenvl_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+
+    ########## siliconflow 视觉模型
+    vision_siliconflow_api_key = ""
+    vision_siliconflow_model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
+    vision_siliconflow_base_url = "https://api.siliconflow.cn/v1"
+
+    ########## OpenAI 视觉模型
+    vision_openai_api_key = ""
+    vision_openai_model_name = "gpt-4.1-nano-2025-04-14"
+    vision_openai_base_url = "https://api.openai.com/v1"
+
+    ########### NarratoAPI 微调模型 (未发布)
+    narrato_api_key = ""
+    narrato_api_url = ""
+    narrato_model = "narra-1.0-2025-05-09"
+
+    # 用于生成文案的大模型支持的提供商 (Supported providers):
+    #   openai (默认, 需要 VPN)
+    #   siliconflow (硅基流动)
+    #   deepseek (深度求索)
+    #   gemini (谷歌, 需要 VPN)
+    #   qwen (通义千问)
+    #   moonshot (月之暗面)
+    text_llm_provider="openai"
+
+    ########## OpenAI API Key
+    # Get your API key at https://platform.openai.com/api-keys
+    text_openai_api_key = ""
+    text_openai_base_url = "https://api.openai.com/v1"
+    text_openai_model_name = "gpt-4.1-mini-2025-04-14"
+
+    # 使用 硅基流动 第三方 API Key，使用手机号注册：https://cloud.siliconflow.cn/i/pyOKqFCV
+    # 访问 https://cloud.siliconflow.cn/account/ak 获取你的 API 密钥
+    text_siliconflow_api_key = ""
+    text_siliconflow_base_url = "https://api.siliconflow.cn/v1"
+    text_siliconflow_model_name = "deepseek-ai/DeepSeek-R1"
+
+    ########## DeepSeek API Key
+    # 访问 https://platform.deepseek.com/api_keys 获取你的 API 密钥
+    text_deepseek_api_key = ""
+    text_deepseek_base_url = "https://api.deepseek.com"
+    text_deepseek_model_name = "deepseek-chat"
+
+    ########## Gemini API Key
+    text_gemini_api_key=""
+    text_gemini_model_name = "gemini-2.0-flash"
+    text_gemini_base_url = "https://generativelanguage.googleapis.com/v1beta/openai"
+
+    ########## Qwen API Key
+    # 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥
+    text_qwen_api_key = ""
+    text_qwen_model_name = "qwen-plus-1127"
+    text_qwen_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+
+    ########## Moonshot API Key
+    # 访问 https://platform.moonshot.cn/console/api-keys 获取你的 API 密钥
+    text_moonshot_api_key=""
+    text_moonshot_base_url = "https://api.moonshot.cn/v1"
+    text_moonshot_model_name = "moonshot-v1-8k"
+
+    # webui界面是否显示配置项
+    hide_config = true
+
+[proxy]
+    http = "http://127.0.0.1:7890"
+    https = "http://127.0.0.1:7890"
+    enabled = false
+
+[frames]
+    # 提取关键帧的间隔时间
+    frame_interval_input = 3
+    # 大模型单次处理的关键帧数量
+    vision_batch_size = 10
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8b36f7c6206982dcbaca297d92d53d7aad0963ee
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,38 @@
+x-common: &common
+  build:
+    context: .
+    dockerfile: Dockerfile
+  image: linyq1/narratoai:latest
+  volumes:
+    - ./:/NarratoAI
+  environment:
+    - VPN_PROXY_URL=http://host.docker.internal:7890
+    - PYTHONUNBUFFERED=1
+    - PYTHONMALLOC=malloc
+    - OPENCV_OPENCL_RUNTIME=disabled
+    - OPENCV_CPU_DISABLE=0
+  restart: always
+  mem_limit: 4g
+  mem_reservation: 2g
+  memswap_limit: 6g
+  cpus: 2.0
+  cpu_shares: 1024
+
+services:
+  webui:
+    <<: *common
+    container_name: webui
+    ports:
+      - "8501:8501"
+    command: ["webui"]
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "200m"
+        max-file: "3"
+    tmpfs:
+      - /tmp:size=1G
+    ulimits:
+      nofile:
+        soft: 65536
+        hard: 65536
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c4ca7d3fbc75cebbbdc66922c79b55cf41ede5e0
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+# "webui" as the first argument launches the Streamlit app; anything else is exec'd as given.
+if [ "$1" = "webui" ]; then
+    exec streamlit run webui.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False
+else
+    exec "$@"
+fi 
\ No newline at end of file
diff --git a/docker/Dockerfile_MiniCPM b/docker/Dockerfile_MiniCPM
new file mode 100644
index 0000000000000000000000000000000000000000..71ab7f958f4943ede3f5ec40e613380bc97a15d2
--- /dev/null
+++ b/docker/Dockerfile_MiniCPM
@@ -0,0 +1,31 @@
+ARG BASE=nvidia/cuda:12.1.0-devel-ubuntu22.04
+FROM ${BASE}
+
+# Environment: route build-time downloads through the host proxy
+ENV http_proxy=http://host.docker.internal:7890
+ENV https_proxy=http://host.docker.internal:7890
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc g++ make git python3 python3-dev python3-pip python3-venv python3-wheel \
+    espeak-ng libsndfile1-dev nano vim unzip wget xz-utils && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Working directory (also the checkout location of the repository)
+WORKDIR /root/MiniCPM-V/
+
+# Install Python dependencies. Clone into the working directory itself ("."), so the code
+# lives at /root/MiniCPM-V/ (matching PYTHONPATH below) instead of a nested MiniCPM-V/MiniCPM-V.
+RUN git clone https://github.com/OpenBMB/MiniCPM-V.git . && \
+    pip3 install decord && \
+    pip3 install --no-cache-dir -r requirements.txt && \
+    pip3 install flash_attn
+
+# Clear the proxy environment variables
+ENV http_proxy=""
+ENV https_proxy=""
+
+# Set PYTHONPATH
+ENV PYTHONPATH="/root/MiniCPM-V/"
diff --git a/docs/check-en.png b/docs/check-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..9e346154e22051a49b33620b80d3a44479899b02
--- /dev/null
+++ b/docs/check-en.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9f85d12ff800d4ed40e695c7c8ab5044367c3842354369a54b3ad9f34b40bc6
+size 945274
diff --git a/docs/check-zh.png b/docs/check-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d3141680df0bc85ede90d21b6af76b6f2bbf60e
--- /dev/null
+++ b/docs/check-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f70e3a51eb6695fc34ff0467410c10e3fc9b6201a5d36b8fe7231899708951d
+size 1411357
diff --git a/docs/img001-en.png b/docs/img001-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..5a0eb6f20b4b50a0cef0be278873c2dd1fe75023
--- /dev/null
+++ b/docs/img001-en.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a3143b3232d5b0eb5755f3bf21d1bc9f4aad4ad9c6d73a9397c7b86c1dab411
+size 104440
diff --git a/docs/img001-zh.png b/docs/img001-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc2c898f8383d602dc6a0145428acc13900e47e4
--- /dev/null
+++ b/docs/img001-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88d039b4513e18bf73ef3469b72af84719f637ab76eae8b7d70acd8426a47b4a
+size 111174
diff --git a/docs/img002-en.png b/docs/img002-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a9c2ac2ad204a8165408e53a44b41d48da43a63
Binary files /dev/null and b/docs/img002-en.png differ
diff --git a/docs/img002-zh.png b/docs/img002-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac31b4718d5deecf73354cea993cb2f5d3dfcf83
Binary files /dev/null and b/docs/img002-zh.png differ
diff --git a/docs/img003-en.png b/docs/img003-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..92e5d498c1289dc734bf3a5eea7400cc7440eac0
Binary files /dev/null and b/docs/img003-en.png differ
diff --git a/docs/img003-zh.png b/docs/img003-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..dd6ee078ca11ce8a14235b5de094dcdd95306b4b
Binary files /dev/null and b/docs/img003-zh.png differ
diff --git a/docs/img004-en.png b/docs/img004-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ad81769e9f6750123614a19eb0de71dbbf69db6
--- /dev/null
+++ b/docs/img004-en.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cf64ba92f6bde53de07369f9466ca660c08c3e86ff0b4581afe3a988e72d757
+size 938179
diff --git a/docs/img004-zh.png b/docs/img004-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5fdb69cf9acdc3966b20f2ee7e67478233b91f4
--- /dev/null
+++ b/docs/img004-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c1b5007482d8b27b01d5751c78fac2bda177523b4fd3535cedf444c895f49a9
+size 712178
diff --git a/docs/img005-en.png b/docs/img005-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..5de935624b94e109a555547863fa15d6510e0476
Binary files /dev/null and b/docs/img005-en.png differ
diff --git a/docs/img005-zh.png b/docs/img005-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..35457623e07f7519d660c1257dc0163e8e0d8420
--- /dev/null
+++ b/docs/img005-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:793f6cda027908bef5d9c56999ecb42ed192939186239b195a9abd5c7cc4640e
+size 147544
diff --git a/docs/img006-en.png b/docs/img006-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd1d61fd7366c27206d27603c90cd2859909f881
--- /dev/null
+++ b/docs/img006-en.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2aac04bf33d7b44513b54d237e5c6d00465de36863e9d77b9c5f0d329632a6a3
+size 170331
diff --git a/docs/img006-zh.png b/docs/img006-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..d30ad7796281ea7f06821fcab6a1d47fac8f4c3e
--- /dev/null
+++ b/docs/img006-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a67e22f4faa34714cf2799696c14d5f6d31122332040da89416b763f764bee4
+size 146062
diff --git a/docs/img007-en.png b/docs/img007-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c285535f0a52a70510418cf4bf2ef11f06a0840
--- /dev/null
+++ b/docs/img007-en.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee0da12bee434fd59ec3cb99ba9f9dec378cc577234a40f9afacf9208ddb53f6
+size 479046
diff --git a/docs/img007-zh.png b/docs/img007-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..719785b2b8aa4a5b7feeb3f2642b31dc19944ef4
--- /dev/null
+++ b/docs/img007-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:591a0cdf922cbb4ae985b8b6b28875184dd2a47f924e65c145dbdf43d2836ddf
+size 307049
diff --git a/docs/index-en.png b/docs/index-en.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ce56ea82caa55e036c96d7548f90db6357505a5
--- /dev/null
+++ b/docs/index-en.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a3e7476f5d036f652c04758339614432c98f5ec4b6df319c2c0e2143aea3503
+size 234637
diff --git a/docs/index-zh.png b/docs/index-zh.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a28da11c0e409809b8976100f247339702c0b97
--- /dev/null
+++ b/docs/index-zh.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bd3f7f3e4391e02bc1aee789e8347e83ab554b65ef05d8f9c429f6107688ed5
+size 234111
diff --git a/docs/voice-list.txt b/docs/voice-list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4672117c65afe2b5e92cb18caf503aadf1df561d
--- /dev/null
+++ b/docs/voice-list.txt
@@ -0,0 +1,941 @@
+Name: af-ZA-AdriNeural
+Gender: Female
+
+Name: af-ZA-WillemNeural
+Gender: Male
+
+Name: am-ET-AmehaNeural
+Gender: Male
+
+Name: am-ET-MekdesNeural
+Gender: Female
+
+Name: ar-AE-FatimaNeural
+Gender: Female
+
+Name: ar-AE-HamdanNeural
+Gender: Male
+
+Name: ar-BH-AliNeural
+Gender: Male
+
+Name: ar-BH-LailaNeural
+Gender: Female
+
+Name: ar-DZ-AminaNeural
+Gender: Female
+
+Name: ar-DZ-IsmaelNeural
+Gender: Male
+
+Name: ar-EG-SalmaNeural
+Gender: Female
+
+Name: ar-EG-ShakirNeural
+Gender: Male
+
+Name: ar-IQ-BasselNeural
+Gender: Male
+
+Name: ar-IQ-RanaNeural
+Gender: Female
+
+Name: ar-JO-SanaNeural
+Gender: Female
+
+Name: ar-JO-TaimNeural
+Gender: Male
+
+Name: ar-KW-FahedNeural
+Gender: Male
+
+Name: ar-KW-NouraNeural
+Gender: Female
+
+Name: ar-LB-LaylaNeural
+Gender: Female
+
+Name: ar-LB-RamiNeural
+Gender: Male
+
+Name: ar-LY-ImanNeural
+Gender: Female
+
+Name: ar-LY-OmarNeural
+Gender: Male
+
+Name: ar-MA-JamalNeural
+Gender: Male
+
+Name: ar-MA-MounaNeural
+Gender: Female
+
+Name: ar-OM-AbdullahNeural
+Gender: Male
+
+Name: ar-OM-AyshaNeural
+Gender: Female
+
+Name: ar-QA-AmalNeural
+Gender: Female
+
+Name: ar-QA-MoazNeural
+Gender: Male
+
+Name: ar-SA-HamedNeural
+Gender: Male
+
+Name: ar-SA-ZariyahNeural
+Gender: Female
+
+Name: ar-SY-AmanyNeural
+Gender: Female
+
+Name: ar-SY-LaithNeural
+Gender: Male
+
+Name: ar-TN-HediNeural
+Gender: Male
+
+Name: ar-TN-ReemNeural
+Gender: Female
+
+Name: ar-YE-MaryamNeural
+Gender: Female
+
+Name: ar-YE-SalehNeural
+Gender: Male
+
+Name: az-AZ-BabekNeural
+Gender: Male
+
+Name: az-AZ-BanuNeural
+Gender: Female
+
+Name: bg-BG-BorislavNeural
+Gender: Male
+
+Name: bg-BG-KalinaNeural
+Gender: Female
+
+Name: bn-BD-NabanitaNeural
+Gender: Female
+
+Name: bn-BD-PradeepNeural
+Gender: Male
+
+Name: bn-IN-BashkarNeural
+Gender: Male
+
+Name: bn-IN-TanishaaNeural
+Gender: Female
+
+Name: bs-BA-GoranNeural
+Gender: Male
+
+Name: bs-BA-VesnaNeural
+Gender: Female
+
+Name: ca-ES-EnricNeural
+Gender: Male
+
+Name: ca-ES-JoanaNeural
+Gender: Female
+
+Name: cs-CZ-AntoninNeural
+Gender: Male
+
+Name: cs-CZ-VlastaNeural
+Gender: Female
+
+Name: cy-GB-AledNeural
+Gender: Male
+
+Name: cy-GB-NiaNeural
+Gender: Female
+
+Name: da-DK-ChristelNeural
+Gender: Female
+
+Name: da-DK-JeppeNeural
+Gender: Male
+
+Name: de-AT-IngridNeural
+Gender: Female
+
+Name: de-AT-JonasNeural
+Gender: Male
+
+Name: de-CH-JanNeural
+Gender: Male
+
+Name: de-CH-LeniNeural
+Gender: Female
+
+Name: de-DE-AmalaNeural
+Gender: Female
+
+Name: de-DE-ConradNeural
+Gender: Male
+
+Name: de-DE-FlorianMultilingualNeural
+Gender: Male
+
+Name: de-DE-KatjaNeural
+Gender: Female
+
+Name: de-DE-KillianNeural
+Gender: Male
+
+Name: de-DE-SeraphinaMultilingualNeural
+Gender: Female
+
+Name: el-GR-AthinaNeural
+Gender: Female
+
+Name: el-GR-NestorasNeural
+Gender: Male
+
+Name: en-AU-NatashaNeural
+Gender: Female
+
+Name: en-AU-WilliamNeural
+Gender: Male
+
+Name: en-CA-ClaraNeural
+Gender: Female
+
+Name: en-CA-LiamNeural
+Gender: Male
+
+Name: en-GB-LibbyNeural
+Gender: Female
+
+Name: en-GB-MaisieNeural
+Gender: Female
+
+Name: en-GB-RyanNeural
+Gender: Male
+
+Name: en-GB-SoniaNeural
+Gender: Female
+
+Name: en-GB-ThomasNeural
+Gender: Male
+
+Name: en-HK-SamNeural
+Gender: Male
+
+Name: en-HK-YanNeural
+Gender: Female
+
+Name: en-IE-ConnorNeural
+Gender: Male
+
+Name: en-IE-EmilyNeural
+Gender: Female
+
+Name: en-IN-NeerjaExpressiveNeural
+Gender: Female
+
+Name: en-IN-NeerjaNeural
+Gender: Female
+
+Name: en-IN-PrabhatNeural
+Gender: Male
+
+Name: en-KE-AsiliaNeural
+Gender: Female
+
+Name: en-KE-ChilembaNeural
+Gender: Male
+
+Name: en-NG-AbeoNeural
+Gender: Male
+
+Name: en-NG-EzinneNeural
+Gender: Female
+
+Name: en-NZ-MitchellNeural
+Gender: Male
+
+Name: en-NZ-MollyNeural
+Gender: Female
+
+Name: en-PH-JamesNeural
+Gender: Male
+
+Name: en-PH-RosaNeural
+Gender: Female
+
+Name: en-SG-LunaNeural
+Gender: Female
+
+Name: en-SG-WayneNeural
+Gender: Male
+
+Name: en-TZ-ElimuNeural
+Gender: Male
+
+Name: en-TZ-ImaniNeural
+Gender: Female
+
+Name: en-US-AnaNeural
+Gender: Female
+
+Name: en-US-AndrewNeural
+Gender: Male
+
+Name: en-US-AriaNeural
+Gender: Female
+
+Name: en-US-AvaNeural
+Gender: Female
+
+Name: en-US-BrianNeural
+Gender: Male
+
+Name: en-US-ChristopherNeural
+Gender: Male
+
+Name: en-US-EmmaNeural
+Gender: Female
+
+Name: en-US-EricNeural
+Gender: Male
+
+Name: en-US-GuyNeural
+Gender: Male
+
+Name: en-US-JennyNeural
+Gender: Female
+
+Name: en-US-MichelleNeural
+Gender: Female
+
+Name: en-US-RogerNeural
+Gender: Male
+
+Name: en-US-SteffanNeural
+Gender: Male
+
+Name: en-ZA-LeahNeural
+Gender: Female
+
+Name: en-ZA-LukeNeural
+Gender: Male
+
+Name: es-AR-ElenaNeural
+Gender: Female
+
+Name: es-AR-TomasNeural
+Gender: Male
+
+Name: es-BO-MarceloNeural
+Gender: Male
+
+Name: es-BO-SofiaNeural
+Gender: Female
+
+Name: es-CL-CatalinaNeural
+Gender: Female
+
+Name: es-CL-LorenzoNeural
+Gender: Male
+
+Name: es-CO-GonzaloNeural
+Gender: Male
+
+Name: es-CO-SalomeNeural
+Gender: Female
+
+Name: es-CR-JuanNeural
+Gender: Male
+
+Name: es-CR-MariaNeural
+Gender: Female
+
+Name: es-CU-BelkysNeural
+Gender: Female
+
+Name: es-CU-ManuelNeural
+Gender: Male
+
+Name: es-DO-EmilioNeural
+Gender: Male
+
+Name: es-DO-RamonaNeural
+Gender: Female
+
+Name: es-EC-AndreaNeural
+Gender: Female
+
+Name: es-EC-LuisNeural
+Gender: Male
+
+Name: es-ES-AlvaroNeural
+Gender: Male
+
+Name: es-ES-ElviraNeural
+Gender: Female
+
+Name: es-ES-XimenaNeural
+Gender: Female
+
+Name: es-GQ-JavierNeural
+Gender: Male
+
+Name: es-GQ-TeresaNeural
+Gender: Female
+
+Name: es-GT-AndresNeural
+Gender: Male
+
+Name: es-GT-MartaNeural
+Gender: Female
+
+Name: es-HN-CarlosNeural
+Gender: Male
+
+Name: es-HN-KarlaNeural
+Gender: Female
+
+Name: es-MX-DaliaNeural
+Gender: Female
+
+Name: es-MX-JorgeNeural
+Gender: Male
+
+Name: es-NI-FedericoNeural
+Gender: Male
+
+Name: es-NI-YolandaNeural
+Gender: Female
+
+Name: es-PA-MargaritaNeural
+Gender: Female
+
+Name: es-PA-RobertoNeural
+Gender: Male
+
+Name: es-PE-AlexNeural
+Gender: Male
+
+Name: es-PE-CamilaNeural
+Gender: Female
+
+Name: es-PR-KarinaNeural
+Gender: Female
+
+Name: es-PR-VictorNeural
+Gender: Male
+
+Name: es-PY-MarioNeural
+Gender: Male
+
+Name: es-PY-TaniaNeural
+Gender: Female
+
+Name: es-SV-LorenaNeural
+Gender: Female
+
+Name: es-SV-RodrigoNeural
+Gender: Male
+
+Name: es-US-AlonsoNeural
+Gender: Male
+
+Name: es-US-PalomaNeural
+Gender: Female
+
+Name: es-UY-MateoNeural
+Gender: Male
+
+Name: es-UY-ValentinaNeural
+Gender: Female
+
+Name: es-VE-PaolaNeural
+Gender: Female
+
+Name: es-VE-SebastianNeural
+Gender: Male
+
+Name: et-EE-AnuNeural
+Gender: Female
+
+Name: et-EE-KertNeural
+Gender: Male
+
+Name: fa-IR-DilaraNeural
+Gender: Female
+
+Name: fa-IR-FaridNeural
+Gender: Male
+
+Name: fi-FI-HarriNeural
+Gender: Male
+
+Name: fi-FI-NooraNeural
+Gender: Female
+
+Name: fil-PH-AngeloNeural
+Gender: Male
+
+Name: fil-PH-BlessicaNeural
+Gender: Female
+
+Name: fr-BE-CharlineNeural
+Gender: Female
+
+Name: fr-BE-GerardNeural
+Gender: Male
+
+Name: fr-CA-AntoineNeural
+Gender: Male
+
+Name: fr-CA-JeanNeural
+Gender: Male
+
+Name: fr-CA-SylvieNeural
+Gender: Female
+
+Name: fr-CA-ThierryNeural
+Gender: Male
+
+Name: fr-CH-ArianeNeural
+Gender: Female
+
+Name: fr-CH-FabriceNeural
+Gender: Male
+
+Name: fr-FR-DeniseNeural
+Gender: Female
+
+Name: fr-FR-EloiseNeural
+Gender: Female
+
+Name: fr-FR-HenriNeural
+Gender: Male
+
+Name: fr-FR-RemyMultilingualNeural
+Gender: Male
+
+Name: fr-FR-VivienneMultilingualNeural
+Gender: Female
+
+Name: ga-IE-ColmNeural
+Gender: Male
+
+Name: ga-IE-OrlaNeural
+Gender: Female
+
+Name: gl-ES-RoiNeural
+Gender: Male
+
+Name: gl-ES-SabelaNeural
+Gender: Female
+
+Name: gu-IN-DhwaniNeural
+Gender: Female
+
+Name: gu-IN-NiranjanNeural
+Gender: Male
+
+Name: he-IL-AvriNeural
+Gender: Male
+
+Name: he-IL-HilaNeural
+Gender: Female
+
+Name: hi-IN-MadhurNeural
+Gender: Male
+
+Name: hi-IN-SwaraNeural
+Gender: Female
+
+Name: hr-HR-GabrijelaNeural
+Gender: Female
+
+Name: hr-HR-SreckoNeural
+Gender: Male
+
+Name: hu-HU-NoemiNeural
+Gender: Female
+
+Name: hu-HU-TamasNeural
+Gender: Male
+
+Name: id-ID-ArdiNeural
+Gender: Male
+
+Name: id-ID-GadisNeural
+Gender: Female
+
+Name: is-IS-GudrunNeural
+Gender: Female
+
+Name: is-IS-GunnarNeural
+Gender: Male
+
+Name: it-IT-DiegoNeural
+Gender: Male
+
+Name: it-IT-ElsaNeural
+Gender: Female
+
+Name: it-IT-GiuseppeNeural
+Gender: Male
+
+Name: it-IT-IsabellaNeural
+Gender: Female
+
+Name: ja-JP-KeitaNeural
+Gender: Male
+
+Name: ja-JP-NanamiNeural
+Gender: Female
+
+Name: jv-ID-DimasNeural
+Gender: Male
+
+Name: jv-ID-SitiNeural
+Gender: Female
+
+Name: ka-GE-EkaNeural
+Gender: Female
+
+Name: ka-GE-GiorgiNeural
+Gender: Male
+
+Name: kk-KZ-AigulNeural
+Gender: Female
+
+Name: kk-KZ-DauletNeural
+Gender: Male
+
+Name: km-KH-PisethNeural
+Gender: Male
+
+Name: km-KH-SreymomNeural
+Gender: Female
+
+Name: kn-IN-GaganNeural
+Gender: Male
+
+Name: kn-IN-SapnaNeural
+Gender: Female
+
+Name: ko-KR-HyunsuNeural
+Gender: Male
+
+Name: ko-KR-InJoonNeural
+Gender: Male
+
+Name: ko-KR-SunHiNeural
+Gender: Female
+
+Name: lo-LA-ChanthavongNeural
+Gender: Male
+
+Name: lo-LA-KeomanyNeural
+Gender: Female
+
+Name: lt-LT-LeonasNeural
+Gender: Male
+
+Name: lt-LT-OnaNeural
+Gender: Female
+
+Name: lv-LV-EveritaNeural
+Gender: Female
+
+Name: lv-LV-NilsNeural
+Gender: Male
+
+Name: mk-MK-AleksandarNeural
+Gender: Male
+
+Name: mk-MK-MarijaNeural
+Gender: Female
+
+Name: ml-IN-MidhunNeural
+Gender: Male
+
+Name: ml-IN-SobhanaNeural
+Gender: Female
+
+Name: mn-MN-BataaNeural
+Gender: Male
+
+Name: mn-MN-YesuiNeural
+Gender: Female
+
+Name: mr-IN-AarohiNeural
+Gender: Female
+
+Name: mr-IN-ManoharNeural
+Gender: Male
+
+Name: ms-MY-OsmanNeural
+Gender: Male
+
+Name: ms-MY-YasminNeural
+Gender: Female
+
+Name: mt-MT-GraceNeural
+Gender: Female
+
+Name: mt-MT-JosephNeural
+Gender: Male
+
+Name: my-MM-NilarNeural
+Gender: Female
+
+Name: my-MM-ThihaNeural
+Gender: Male
+
+Name: nb-NO-FinnNeural
+Gender: Male
+
+Name: nb-NO-PernilleNeural
+Gender: Female
+
+Name: ne-NP-HemkalaNeural
+Gender: Female
+
+Name: ne-NP-SagarNeural
+Gender: Male
+
+Name: nl-BE-ArnaudNeural
+Gender: Male
+
+Name: nl-BE-DenaNeural
+Gender: Female
+
+Name: nl-NL-ColetteNeural
+Gender: Female
+
+Name: nl-NL-FennaNeural
+Gender: Female
+
+Name: nl-NL-MaartenNeural
+Gender: Male
+
+Name: pl-PL-MarekNeural
+Gender: Male
+
+Name: pl-PL-ZofiaNeural
+Gender: Female
+
+Name: ps-AF-GulNawazNeural
+Gender: Male
+
+Name: ps-AF-LatifaNeural
+Gender: Female
+
+Name: pt-BR-AntonioNeural
+Gender: Male
+
+Name: pt-BR-FranciscaNeural
+Gender: Female
+
+Name: pt-BR-ThalitaNeural
+Gender: Female
+
+Name: pt-PT-DuarteNeural
+Gender: Male
+
+Name: pt-PT-RaquelNeural
+Gender: Female
+
+Name: ro-RO-AlinaNeural
+Gender: Female
+
+Name: ro-RO-EmilNeural
+Gender: Male
+
+Name: ru-RU-DmitryNeural
+Gender: Male
+
+Name: ru-RU-SvetlanaNeural
+Gender: Female
+
+Name: si-LK-SameeraNeural
+Gender: Male
+
+Name: si-LK-ThiliniNeural
+Gender: Female
+
+Name: sk-SK-LukasNeural
+Gender: Male
+
+Name: sk-SK-ViktoriaNeural
+Gender: Female
+
+Name: sl-SI-PetraNeural
+Gender: Female
+
+Name: sl-SI-RokNeural
+Gender: Male
+
+Name: so-SO-MuuseNeural
+Gender: Male
+
+Name: so-SO-UbaxNeural
+Gender: Female
+
+Name: sq-AL-AnilaNeural
+Gender: Female
+
+Name: sq-AL-IlirNeural
+Gender: Male
+
+Name: sr-RS-NicholasNeural
+Gender: Male
+
+Name: sr-RS-SophieNeural
+Gender: Female
+
+Name: su-ID-JajangNeural
+Gender: Male
+
+Name: su-ID-TutiNeural
+Gender: Female
+
+Name: sv-SE-MattiasNeural
+Gender: Male
+
+Name: sv-SE-SofieNeural
+Gender: Female
+
+Name: sw-KE-RafikiNeural
+Gender: Male
+
+Name: sw-KE-ZuriNeural
+Gender: Female
+
+Name: sw-TZ-DaudiNeural
+Gender: Male
+
+Name: sw-TZ-RehemaNeural
+Gender: Female
+
+Name: ta-IN-PallaviNeural
+Gender: Female
+
+Name: ta-IN-ValluvarNeural
+Gender: Male
+
+Name: ta-LK-KumarNeural
+Gender: Male
+
+Name: ta-LK-SaranyaNeural
+Gender: Female
+
+Name: ta-MY-KaniNeural
+Gender: Female
+
+Name: ta-MY-SuryaNeural
+Gender: Male
+
+Name: ta-SG-AnbuNeural
+Gender: Male
+
+Name: ta-SG-VenbaNeural
+Gender: Female
+
+Name: te-IN-MohanNeural
+Gender: Male
+
+Name: te-IN-ShrutiNeural
+Gender: Female
+
+Name: th-TH-NiwatNeural
+Gender: Male
+
+Name: th-TH-PremwadeeNeural
+Gender: Female
+
+Name: tr-TR-AhmetNeural
+Gender: Male
+
+Name: tr-TR-EmelNeural
+Gender: Female
+
+Name: uk-UA-OstapNeural
+Gender: Male
+
+Name: uk-UA-PolinaNeural
+Gender: Female
+
+Name: ur-IN-GulNeural
+Gender: Female
+
+Name: ur-IN-SalmanNeural
+Gender: Male
+
+Name: ur-PK-AsadNeural
+Gender: Male
+
+Name: ur-PK-UzmaNeural
+Gender: Female
+
+Name: uz-UZ-MadinaNeural
+Gender: Female
+
+Name: uz-UZ-SardorNeural
+Gender: Male
+
+Name: vi-VN-HoaiMyNeural
+Gender: Female
+
+Name: vi-VN-NamMinhNeural
+Gender: Male
+
+Name: zh-CN-XiaoxiaoNeural
+Gender: Female
+
+Name: zh-CN-XiaoyiNeural
+Gender: Female
+
+Name: zh-CN-YunjianNeural
+Gender: Male
+
+Name: zh-CN-YunxiNeural
+Gender: Male
+
+Name: zh-CN-YunxiaNeural
+Gender: Male
+
+Name: zh-CN-YunyangNeural
+Gender: Male
+
+Name: zh-CN-liaoning-XiaobeiNeural
+Gender: Female
+
+Name: zh-CN-shaanxi-XiaoniNeural
+Gender: Female
+
+Name: zh-HK-HiuGaaiNeural
+Gender: Female
+
+Name: zh-HK-HiuMaanNeural
+Gender: Female
+
+Name: zh-HK-WanLungNeural
+Gender: Male
+
+Name: zh-TW-HsiaoChenNeural
+Gender: Female
+
+Name: zh-TW-HsiaoYuNeural
+Gender: Female
+
+Name: zh-TW-YunJheNeural
+Gender: Male
+
+Name: zu-ZA-ThandoNeural
+Gender: Female
+
+Name: zu-ZA-ThembaNeural
+Gender: Male
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfec175bea6e741c9ba1ffed3052aca15599a8a6
--- /dev/null
+++ b/main.py
@@ -0,0 +1,19 @@
+import os
+import uvicorn
+from loguru import logger
+
+from app.config import config
+
+if __name__ == "__main__":
+    logger.info(
+        "start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs"
+    )
+    os.environ["HTTP_PROXY"] = config.proxy.get("http")
+    os.environ["HTTPS_PROXY"] = config.proxy.get("https")
+    uvicorn.run(
+        app="app.asgi:app",
+        host=config.listen_host,
+        port=config.listen_port,
+        reload=config.reload_debug,
+        log_level="warning",
+    )
diff --git a/project_version b/project_version
new file mode 100644
index 0000000000000000000000000000000000000000..a0a15177f39314e2c4a95e4982f4dc0bf2f757a8
--- /dev/null
+++ b/project_version
@@ -0,0 +1 @@
+0.6.3
\ No newline at end of file
diff --git a/release-notes.md b/release-notes.md
new file mode 100644
index 0000000000000000000000000000000000000000..d290698c75e3f2c8a31786a2abd9c9cfe085a3d4
--- /dev/null
+++ b/release-notes.md
@@ -0,0 +1,17 @@
+# Release Notes
+
+## Latest Changes
+
+* docs(README): 更新README. PR [#138](https://github.com/linyqh/NarratoAI/pull/138) by [@linyqh](https://github.com/linyqh).
+* Dev 0.6.0. PR [#137](https://github.com/linyqh/NarratoAI/pull/137) by [@linyqh](https://github.com/linyqh).
+* Dev 0.6.0 . PR [#134](https://github.com/linyqh/NarratoAI/pull/134) by [@linyqh](https://github.com/linyqh).
+* Dev-0.3.9. PR [#73](https://github.com/linyqh/NarratoAI/pull/73) by [@linyqh](https://github.com/linyqh).
+* 0.3.9 版本发布. PR [#71](https://github.com/linyqh/NarratoAI/pull/71) by [@linyqh](https://github.com/linyqh).
+* docs: add Japanese README. PR [#66](https://github.com/linyqh/NarratoAI/pull/66) by [@eltociear](https://github.com/eltociear).
+* docs: 测试 release 2. PR [#62](https://github.com/linyqh/NarratoAI/pull/62) by [@linyqh](https://github.com/linyqh).
+* docs: 测试 release. PR [#61](https://github.com/linyqh/NarratoAI/pull/61) by [@linyqh](https://github.com/linyqh).
+* docs: 测试commit. PR [#60](https://github.com/linyqh/NarratoAI/pull/60) by [@linyqh](https://github.com/linyqh).
+* Dev. PR [#59](https://github.com/linyqh/NarratoAI/pull/59) by [@linyqh](https://github.com/linyqh).
+* 0.2.0新版预发布. PR [#37](https://github.com/linyqh/NarratoAI/pull/37) by [@linyqh](https://github.com/linyqh).
+* v0.3.6. PR [#58](https://github.com/linyqh/NarratoAI/pull/58) by [@linyqh](https://github.com/linyqh).
+* 0.3.4 修改各种bug. PR [#49](https://github.com/linyqh/NarratoAI/pull/49) by [@linyqh](https://github.com/linyqh).
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cddc9b11dc61814ad4f16b3686c126d155b3aced
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,47 @@
+# 必须项
+requests~=2.32.0
+moviepy==2.1.1
+edge-tts==6.1.19
+streamlit~=1.45.0
+watchdog==6.0.0
+loguru~=0.7.3
+tomli~=2.2.1
+pydub==0.25.1
+pysrt==1.1.2
+
+openai~=1.77.0
+google-generativeai>=0.8.5
+
+# 待优化项
+# opencv-python==4.11.0.86
+# scikit-learn==1.6.1
+
+# fastapi~=0.115.4
+# uvicorn~=0.27.1
+# pydantic~=2.11.4
+
+# faster-whisper~=1.0.1
+# tomli~=2.0.1
+# aiohttp~=3.10.10
+# httpx==0.27.2
+# urllib3~=2.2.1
+
+# python-multipart~=0.0.9
+# redis==5.0.3
+# opencv-python~=4.10.0.84
+# azure-cognitiveservices-speech~=1.37.0
+# git-changelog~=2.5.2
+# watchdog==5.0.2
+# pydub==0.25.1
+# psutil>=5.9.0
+# scikit-learn~=1.5.2
+# pillow==10.3.0
+# python-dotenv~=1.0.1
+
+# tqdm>=4.66.6
+# tenacity>=9.0.0
+# tiktoken==0.8.0
+# pysrt==1.1.2
+# transformers==4.50.0
+
+# yt-dlp==2025.4.30
\ No newline at end of file
diff --git a/resource/fonts/fonts_in_here.txt b/resource/fonts/fonts_in_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8858c694f102e94feea5e69d01ad14b2e735982f
--- /dev/null
+++ b/resource/fonts/fonts_in_here.txt
@@ -0,0 +1 @@
+此处放字体文件
\ No newline at end of file
diff --git a/resource/public/index.html b/resource/public/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..9de1f1c086ebdc79b9d46c6fdadc53d5fa655079
--- /dev/null
+++ b/resource/public/index.html
@@ -0,0 +1,25 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>NarratoAI</title>
+</head>
+<body>
+<h1>NarratoAI</h1>
+<a href="https://github.com/linyqh/NarratoAI">项目地址：https://github.com/linyqh/NarratoAI</a>
+<hr>
+<a href="http://127.0.0.1:8501">webui 地址：http://127.0.0.1:8501</a>
+<br>
+<a href="http://127.0.0.1:8080/docs">api swagger 地址：http://127.0.0.1:8080/docs</a>
+<hr>
+<p>
+    NarratoAI 是一个自动化影视解说工具，基于LLM实现文案撰写、自动化视频剪辑、配音和字幕生成的一站式流程，助力高效内容创作。
+</p>
+
+<p>
+    NarratoAI is an automated film and television commentary tool that implements a one-stop process of copywriting, automated video editing, dubbing and subtitle generation based on LLM, facilitating efficient content creation.
+</p>
+</body>
+</html>
\ No newline at end of file
diff --git a/resource/scripts/script_in_here.txt b/resource/scripts/script_in_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/resource/songs/song_in_here.txt b/resource/songs/song_in_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/resource/srt/srt_in_here.txt b/resource/srt/srt_in_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/resource/videos/video_in_here.txt b/resource/videos/video_in_here.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/video_pipeline.py b/video_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc7fa26370a878823460a0fb0b68d39b7ce77ad6
--- /dev/null
+++ b/video_pipeline.py
@@ -0,0 +1,178 @@
+import requests
+import json
+import os
+import time
+from typing import Dict, Any
+
+class VideoPipeline:
+    def __init__(self, base_url: str = "http://127.0.0.1:8080"):
+        self.base_url = base_url
+        
+    def download_video(self, url: str, resolution: str = "1080p", 
+                      output_format: str = "mp4", rename: str = None) -> Dict[str, Any]:
+        """下载视频的第一步"""
+        endpoint = f"{self.base_url}/api/v2/youtube/download"
+        payload = {
+            "url": url,
+            "resolution": resolution,
+            "output_format": output_format,
+            "rename": rename or time.strftime("%Y-%m-%d")
+        }
+        
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def generate_script(self, video_path: str, skip_seconds: int = 0,
+                       threshold: int = 30, vision_batch_size: int = 10,
+                       vision_llm_provider: str = "gemini") -> Dict[str, Any]:
+        """生成脚本的第二步"""
+        endpoint = f"{self.base_url}/api/v2/scripts/generate"
+        payload = {
+            "video_path": video_path,
+            "skip_seconds": skip_seconds,
+            "threshold": threshold,
+            "vision_batch_size": vision_batch_size,
+            "vision_llm_provider": vision_llm_provider
+        }
+        
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def crop_video(self, video_path: str, script: list) -> Dict[str, Any]:
+        """剪辑视频的第三步"""
+        endpoint = f"{self.base_url}/api/v2/scripts/crop"
+        payload = {
+            "video_origin_path": video_path,
+            "video_script": script
+        }
+        
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def generate_final_video(self, task_id: str, video_path: str, 
+                           script_path: str, script: list, subclip_videos: Dict[str, str], voice_name: str) -> Dict[str, Any]:
+        """生成最终视频的第四步"""
+        endpoint = f"{self.base_url}/api/v2/scripts/start-subclip"
+        
+        request_data = {
+            "video_clip_json": script,
+            "video_clip_json_path": script_path,
+            "video_origin_path": video_path,
+            "video_aspect": "16:9",
+            "video_language": "zh-CN",
+            "voice_name": voice_name,
+            "voice_volume": 1,
+            "voice_rate": 1.2,
+            "voice_pitch": 1,
+            "bgm_name": "random",
+            "bgm_type": "random",
+            "bgm_file": "",
+            "bgm_volume": 0.3,
+            "subtitle_enabled": True,
+            "subtitle_position": "bottom",
+            "font_name": "STHeitiMedium.ttc",
+            "text_fore_color": "#FFFFFF",
+            "text_background_color": "transparent",
+            "font_size": 75,
+            "stroke_color": "#000000",
+            "stroke_width": 1.5,
+            "custom_position": 70,
+            "n_threads": 8
+        }
+        
+        payload = {
+            "request": request_data,
+            "subclip_videos": subclip_videos
+        }
+        
+        params = {"task_id": task_id}
+        response = requests.post(endpoint, params=params, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def save_script_to_json(self, script: list, script_path: str) -> str:
+        """保存脚本到json文件"""        
+        try:
+            with open(script_path, 'w', encoding='utf-8') as f:
+                json.dump(script, f, ensure_ascii=False, indent=2)
+            print(f"脚本已保存到: {script_path}")
+            return script_path
+        except Exception as e:
+            print(f"保存脚本失败: {str(e)}")
+            raise
+    
+    def run_pipeline(self, task_id: str, script_name: str, youtube_url: str, video_name: str="null", skip_seconds: int = 0, threshold: int = 30, vision_batch_size: int = 10, vision_llm_provider: str = "gemini", voice_name: str = "zh-CN-YunjianNeural") -> Dict[str, Any]:
+        """运行完整的pipeline"""
+        try:
+            current_path = os.path.dirname(os.path.abspath(__file__))
+            video_path = os.path.join(current_path, "resource", "videos", f"{video_name}.mp4")
+            # 判断视频是否存在
+            if not os.path.exists(video_path):
+                # 1. 下载视频
+                print(f"视频不存在, 开始下载视频: {video_path}")
+                download_result = self.download_video(url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name)
+                video_path = download_result["output_path"]
+            else:
+                print(f"视频已存在: {video_path}")
+            
+            # 2. 判断script_name是否存在
+            # 2.1.1 拼接脚本路径 NarratoAI/resource/scripts
+            script_path = os.path.join(current_path, "resource", "scripts", script_name)
+            if os.path.exists(script_path):
+                script = json.load(open(script_path, "r", encoding="utf-8"))
+            else:
+                # 2.1.2 生成脚本
+                print("开始生成脚本...")
+                script_result = self.generate_script(video_path=video_path, skip_seconds=skip_seconds, threshold=threshold, vision_batch_size=vision_batch_size, vision_llm_provider=vision_llm_provider)
+                script = script_result["script"]
+            
+            # 2.2 保存脚本到json文件
+            print("保存脚本到json文件...")
+            self.save_script_to_json(script=script, script_path=script_path)
+            
+            # 3. 剪辑视频
+            print("开始剪辑视频...")
+            crop_result = self.crop_video(video_path=video_path, script=script)
+            subclip_videos = crop_result["subclip_videos"]
+            
+            # 4. 生成最终视频
+            print("开始生成最终视频...")
+            self.generate_final_video(
+                task_id=task_id,
+                video_path=video_path,
+                script_path=script_path,
+                script=script,
+                subclip_videos=subclip_videos,
+                voice_name=voice_name
+            )
+            
+            return {
+                "status": "等待异步生成视频",
+                "path": os.path.join(current_path, "storage", "tasks", task_id)
+            }
+            
+        except Exception as e:
+            return {
+                "status": "error",
+                "error": str(e)
+            }
+
+
+# Usage example: runs the whole pipeline against the default base URL
+# (http://127.0.0.1:8080) and prints the resulting status dict.
+if __name__ == "__main__":
+    pipeline = VideoPipeline()
+    result = pipeline.run_pipeline(
+        task_id="test_111901",
+        script_name="test.json",
+        youtube_url="https://www.youtube.com/watch?v=vLJ7Yed6FQ4",
+        video_name="2024-11-19-01",
+        skip_seconds=50,
+        threshold=35,
+        vision_batch_size=10,
+        vision_llm_provider="gemini",
+        voice_name="zh-CN-YunjianNeural",
+    )
+    print(result)
diff --git a/webui.py b/webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d8283839805fa8136b1117293150d54c10ca226
--- /dev/null
+++ b/webui.py
@@ -0,0 +1,249 @@
+import streamlit as st
+import os
+import sys
+from loguru import logger
+from app.config import config
+from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
+    review_settings, merge_settings, system_settings
+# from webui.utils import cache, file_utils
+from app.utils import utils
+from app.utils import ffmpeg_utils
+from app.models.schema import VideoClipParams, VideoAspect
+
+
+# Page configuration - must be the first Streamlit command
+st.set_page_config(
+    page_title="NarratoAI",
+    page_icon="📽️",
+    layout="wide",
+    initial_sidebar_state="auto",
+    menu_items={
+        "Report a bug": "https://github.com/linyqh/NarratoAI/issues",
+        'About': f"# Narrato:blue[AI] :sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
+                 f"自动化影视解说视频详情请移步：https://github.com/linyqh/NarratoAI"
+    },
+)
+
+# Page style: inject CSS that sets the padding of the main content section
+hide_streamlit_style = """
+<style>#root > div:nth-child(1) > div > div > div > div > section > div {padding-top: 6px; padding-bottom: 10px; padding-left: 20px; padding-right: 20px;}</style>
+"""
+st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+
+def init_log():
+    """初始化日志配置"""
+    from loguru import logger
+    logger.remove()
+    _lvl = "DEBUG"
+
+    def format_record(record):
+        # 简化日志格式化处理，不尝试按特定字符串过滤torch相关内容
+        file_path = record["file"].path
+        relative_path = os.path.relpath(file_path, config.root_dir)
+        record["file"].path = f"./{relative_path}"
+        record['message'] = record['message'].replace(config.root_dir, ".")
+
+        _format = '<green>{time:%Y-%m-%d %H:%M:%S}</> | ' + \
+                  '<level>{level}</> | ' + \
+                  '"{file.path}:{line}":<blue> {function}</> ' + \
+                  '- <level>{message}</>' + "\n"
+        return _format
+
+    # 替换为更简单的过滤方式，避免在过滤时访问message内容
+    # 此处先不设置复杂的过滤器，等应用启动后再动态添加
+    logger.add(
+        sys.stdout,
+        level=_lvl,
+        format=format_record,
+        colorize=True
+    )
+
+    # 应用启动后，可以再添加更复杂的过滤器
+    def setup_advanced_filters():
+        """在应用完全启动后设置高级过滤器"""
+        try:
+            for handler_id in logger._core.handlers:
+                logger.remove(handler_id)
+
+            # 重新添加带有高级过滤的处理器
+            def advanced_filter(record):
+                """更复杂的过滤器，在应用启动后安全使用"""
+                ignore_messages = [
+                    "Examining the path of torch.classes raised",
+                    "torch.cuda.is_available()",
+                    "CUDA initialization"
+                ]
+                return not any(msg in record["message"] for msg in ignore_messages)
+
+            logger.add(
+                sys.stdout,
+                level=_lvl,
+                format=format_record,
+                colorize=True,
+                filter=advanced_filter
+            )
+        except Exception as e:
+            # 如果过滤器设置失败，确保日志仍然可用
+            logger.add(
+                sys.stdout,
+                level=_lvl,
+                format=format_record,
+                colorize=True
+            )
+            logger.error(f"设置高级日志过滤器失败: {e}")
+
+    # 将高级过滤器设置放到启动主逻辑后
+    import threading
+    threading.Timer(5.0, setup_advanced_filters).start()
+
+
+def init_global_state():
+    """初始化全局状态"""
+    if 'video_clip_json' not in st.session_state:
+        st.session_state['video_clip_json'] = []
+    if 'video_plot' not in st.session_state:
+        st.session_state['video_plot'] = ''
+    if 'ui_language' not in st.session_state:
+        st.session_state['ui_language'] = config.ui.get("language", utils.get_system_locale())
+    if 'subclip_videos' not in st.session_state:
+        st.session_state['subclip_videos'] = {}
+
+
+def tr(key):
+    """翻译函数"""
+    i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n")
+    locales = utils.load_locales(i18n_dir)
+    loc = locales.get(st.session_state['ui_language'], {})
+    return loc.get("Translation", {}).get(key, key)
+
+
+def render_generate_button():
+    """渲染生成按钮和处理逻辑"""
+    if st.button(tr("Generate Video"), use_container_width=True, type="primary"):
+        from app.services import task as tm
+
+        # 重置日志容器和记录
+        log_container = st.empty()
+        log_records = []
+
+        def log_received(msg):
+            with log_container:
+                log_records.append(msg)
+                st.code("\n".join(log_records))
+
+        from loguru import logger
+        logger.add(log_received)
+
+        config.save_config()
+        task_id = st.session_state.get('task_id')
+
+        if not task_id:
+            st.error(tr("请先裁剪视频"))
+            return
+        if not st.session_state.get('video_clip_json_path'):
+            st.error(tr("脚本文件不能为空"))
+            return
+        if not st.session_state.get('video_origin_path'):
+            st.error(tr("视频文件不能为空"))
+            return
+
+        st.toast(tr("生成视频"))
+        logger.info(tr("开始生成视频"))
+
+        # 获取所有参数
+        script_params = script_settings.get_script_params()
+        video_params = video_settings.get_video_params()
+        audio_params = audio_settings.get_audio_params()
+        subtitle_params = subtitle_settings.get_subtitle_params()
+
+        # 合并所有参数
+        all_params = {
+            **script_params,
+            **video_params,
+            **audio_params,
+            **subtitle_params
+        }
+
+        # 创建参数对象
+        params = VideoClipParams(**all_params)
+
+        result = tm.start_subclip(
+            task_id=task_id,
+            params=params,
+            subclip_path_videos=st.session_state['subclip_videos']
+        )
+
+        video_files = result.get("videos", [])
+        st.success(tr("视生成完成"))
+
+        try:
+            if video_files:
+                player_cols = st.columns(len(video_files) * 2 + 1)
+                for i, url in enumerate(video_files):
+                    player_cols[i * 2 + 1].video(url)
+        except Exception as e:
+            logger.error(f"播放视频失败: {e}")
+
+        # file_utils.open_task_folder(config.root_dir, task_id)
+        logger.info(tr("视频生成完成"))
+
+
+# Module-level flag recording whether the hardware-acceleration info has been logged.
+# NOTE(review): if this module is re-executed on every Streamlit rerun, the flag is
+# reset each time and the info is logged again - confirm the intended behavior.
+_HAS_LOGGED_HWACCEL_INFO = False
+
+def main():
+    """Entry point: set up logging and session state, then render the whole page."""
+    global _HAS_LOGGED_HWACCEL_INFO
+    init_log()
+    init_global_state()
+
+    # Detect FFmpeg hardware acceleration, but log the result only once
+    hwaccel_info = ffmpeg_utils.detect_hardware_acceleration()
+    if not _HAS_LOGGED_HWACCEL_INFO:
+        if hwaccel_info["available"]:
+            logger.info(f"FFmpeg硬件加速检测结果: 可用 | 类型: {hwaccel_info['type']} | 编码器: {hwaccel_info['encoder']} | 独立显卡: {hwaccel_info['is_dedicated_gpu']} | 参数: {hwaccel_info['hwaccel_args']}")
+        else:
+            logger.warning(f"FFmpeg硬件加速不可用: {hwaccel_info['message']}, 将使用CPU软件编码")
+        _HAS_LOGGED_HWACCEL_INFO = True
+
+    # Initialize only basic resources, to avoid loading PyTorch-dependent ones too early.
+    # TODO: check whether utils.init_resources() can be split into basic resources
+    # and advanced (e.g. PyTorch-dependent) resources.
+    try:
+        utils.init_resources()
+    except Exception as e:
+        # Best effort: a failure here is logged as a warning and rendering continues
+        logger.warning(f"资源初始化时出现警告: {e}")
+
+    st.title(f"Narrato:blue[AI]:sunglasses: 📽️")
+    st.write(tr("Get Help"))
+
+    # Render the UI parts that do not depend on PyTorch first
+    # Basic settings panel
+    basic_settings.render_basic_settings(tr)
+    # Merge settings
+    merge_settings.render_merge_settings(tr)
+
+    # Main area: three columns
+    panel = st.columns(3)
+    with panel[0]:
+        script_settings.render_script_panel(tr)
+    with panel[1]:
+        video_settings.render_video_panel(tr)
+        audio_settings.render_audio_panel(tr)
+    with panel[2]:
+        subtitle_settings.render_subtitle_panel(tr)
+
+    # Video review panel
+    review_settings.render_review_panel(tr)
+
+    # Render the parts that may use PyTorch last
+    # System settings panel (placed in the third column, below the subtitle panel)
+    with panel[2]:
+        system_settings.render_system_panel(tr)
+
+    # Finally render the generate button and its handling logic
+    render_generate_button()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/webui.txt b/webui.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b26a7232866feede933981b08fa35c2fe3286917
--- /dev/null
+++ b/webui.txt
@@ -0,0 +1,376 @@
+@echo off
+set CURRENT_DIR=%CD%
+echo ***** Current directory: %CURRENT_DIR% *****
+set PYTHONPATH=%CURRENT_DIR%
+
+set "vpn_proxy_url=http://127.0.0.1:7890"
+
+:: Route HTTP(S) traffic through the local VPN proxy, e.g. for the downloads below (the URL must be a literal, not wrapped in %...%)
+set "http_proxy=%vpn_proxy_url%"
+set "https_proxy=%vpn_proxy_url%"
+
+@echo off
+setlocal enabledelayedexpansion
+
+rem Pseudo-array of "URL|destination directory" pairs; a literal percent sign in a URL must be written as %%
+set "urls_paths[0]=https://zenodo.org/records/13293144/files/MicrosoftYaHeiBold.ttc|.\resource\fonts"
+set "urls_paths[1]=https://zenodo.org/records/13293144/files/MicrosoftYaHeiNormal.ttc|.\resource\fonts"
+set "urls_paths[2]=https://zenodo.org/records/13293144/files/STHeitiLight.ttc|.\resource\fonts"
+set "urls_paths[3]=https://zenodo.org/records/13293144/files/STHeitiMedium.ttc|.\resource\fonts"
+set "urls_paths[4]=https://zenodo.org/records/13293144/files/UTM%%20Kabel%%20KT.ttf|.\resource\fonts"
+set "urls_paths[5]=https://zenodo.org/records/14167125/files/test.mp4|.\resource\videos"
+set "urls_paths[6]=https://zenodo.org/records/13293150/files/output000.mp3|.\resource\songs"
+set "urls_paths[7]=https://zenodo.org/records/13293150/files/output001.mp3|.\resource\songs"
+set "urls_paths[8]=https://zenodo.org/records/13293150/files/output002.mp3|.\resource\songs"
+set "urls_paths[9]=https://zenodo.org/records/13293150/files/output003.mp3|.\resource\songs"
+set "urls_paths[10]=https://zenodo.org/records/13293150/files/output004.mp3|.\resource\songs"
+set "urls_paths[11]=https://zenodo.org/records/13293150/files/output005.mp3|.\resource\songs"
+set "urls_paths[12]=https://zenodo.org/records/13293150/files/output006.mp3|.\resource\songs"
+set "urls_paths[13]=https://zenodo.org/records/13293150/files/output007.mp3|.\resource\songs"
+set "urls_paths[14]=https://zenodo.org/records/13293150/files/output008.mp3|.\resource\songs"
+set "urls_paths[15]=https://zenodo.org/records/13293150/files/output009.mp3|.\resource\songs"
+set "urls_paths[16]=https://zenodo.org/records/13293150/files/output010.mp3|.\resource\songs"
+
+rem Download every entry into its destination directory, creating the directory first if it is missing
+for /L %%i in (0,1,16) do (
+    for /f "tokens=1,2 delims=|" %%a in ("!urls_paths[%%i]!") do (
+        if not exist "%%b" mkdir "%%b"
+        echo 正在下载 %%a 到 %%b
+        curl -o "%%b\%%~nxa" %%a
+    )
+)
+
+echo 所有文件已成功下载到指定目录
+endlocal
+pause
+
+
+rem set HF_ENDPOINT=https://hf-mirror.com
+streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True  --server.maxUploadSize=2048 --browser.gatherUsageStats=False
+
+streamlit run webui.py --server.maxUploadSize=2048
+
+请求0：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/youtube/download' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}'
+{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}
+
+请求1：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/generate' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}'
+{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}
+
+请求2：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/crop' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}'
+{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}
+
+请求3：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}'
+{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}
+
+
+请在最外层新建一个pipeline 工作流执行逻辑的代码；
+它会按照下面的顺序请求接口
+1.下载视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/youtube/download' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}'
+2.生成脚本
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/generate' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}'
+3. 剪辑视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/crop' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}'
+4.生成视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}'
+
+请求1，返回的参数是：
+{
+  "task_id": "4e9b575f-68c0-4ae1-b218-db42b67993d0",
+  "output_path": "E:\\projects\\NarratoAI\\resource\\videos\\2024-11-19.mp4",
+  "resolution": "1080p",
+  "format": "mp4",
+  "filename": "2024-11-19.mp4"
+}
+output_path需要传递给请求2
+请求2，返回数据为：
+{
+  "task_id": "04497017-953c-44b4-bf1d-9d8ed3ebbbce",
+  "script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是對影片畫面的客觀描述：\n\n影片顯示一名留著鬍鬚的男子在一處樹林茂密的斜坡上挖掘。\n\n畫面一：男子從後方出現，背著一個軍綠色的背包，背包裡似乎裝有工具。他穿著卡其色的長褲和深色的登山鞋。\n\n畫面二：特寫鏡頭顯示男子的背包，一個舊的鎬頭從包裡露出來，包裡還有其他工具，包括一個鏟子。\n\n畫面三：男子用鎬頭在斜坡上挖土，背包放在他旁邊。\n\n畫面四：特寫鏡頭顯示男子的登山鞋在泥土中。\n\n畫面五：男子坐在斜坡上，用手清理樹根和泥土。\n\n畫面六：地上有一些鬆動的泥土和落葉。\n\n畫面七：男子的背包近景鏡頭，他正在挖掘。\n\n畫面八：男子在斜坡上挖掘，揚起一陣塵土。\n\n畫面九：特寫鏡頭顯示男子用手清理泥土。\n\n畫面十：特寫鏡頭顯示挖出的泥土剖面，可以看到土壤的層次。",
+      "narration": "上一个画面是我在绝美的自然中，准备开启我的“土豪”挖掘之旅。现在，你们看到这位留着胡子的“大哥”，他背着个军绿色的包，里面装的可不仅仅是工具，还有我对生活的无限热爱（以及一丝不安）。看！这把旧镐头就像我的前任——用起来费劲，但又舍不得扔掉。\n\n他在斜坡上挖土，泥土飞扬，仿佛在跟大地进行一场“泥巴大战”。每一铲下去，都能听到大地微微的呻吟：哎呀，我这颗小树根可比我当年的情感纠葛还难处理呢！别担心，这些泥土层次分明，简直可以开个“泥土博物馆”。所以，朋友们，跟着我一起享受这场泥泞中的乐趣吧！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是對影片畫面內容的客觀描述：\n\n影片以一系列森林環境的鏡頭開始。第一個鏡頭展示了綠葉植物的特寫鏡頭，葉子上有一些水珠。接下來的鏡頭是一個男人在森林裡挖掘一個小坑，他跪在地上，用鏟子挖土。\n\n接下來的鏡頭是同一個男人坐在他挖的坑旁邊，望著前方。然後，鏡頭顯示該坑的廣角鏡頭，顯示其結構和大小。\n\n之後的鏡頭，同一個男人在樹林裡劈柴。鏡頭最後呈現出一潭渾濁的水，周圍環繞著樹枝。然後鏡頭又回到了森林裡生長茂盛的植物特寫鏡頭。",
+      "narration": "好嘞，朋友们，我们已经在泥土博物馆里捣鼓了一阵子，现在是时候跟大自然亲密接触了！看看这片森林，绿叶上水珠闪闪发光，就像我曾经的爱情，虽然短暂，却美得让人心碎。\n\n现在，我在这里挖个小坑，感觉自己就像是一位新晋“挖土大王”，不过说实话，这手艺真不敢恭维，连铲子都快对我崩溃了。再说劈柴，这动作简直比我前任的情绪波动还要激烈！最后这一潭浑浊的水，别担心，它只是告诉我：生活就像这水，总有些杂质，但也别忘了，要勇敢面对哦！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}
+output_path和script参数需要传递给请求3
+请求3返回参数是
+{
+  "task_id": "b6f5a98a-b2e0-4e3d-89c5-64fb90db2ec1",
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}
+subclip_videos和 output_path和script参数需要传递给请求4
+最后完成工作流
+
+0代表只播放文案音频，禁用视频原声；1代表只播放视频原声，不需要播放文案音频和字幕；2代表既播放文案音频也要播放视频原声；
\ No newline at end of file
diff --git a/webui/__init__.py b/webui/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5f92ee5c62a26c307fd35f39954104963435d3
--- /dev/null
+++ b/webui/__init__.py
@@ -0,0 +1,21 @@
+"""
+NarratoAI WebUI Package
+"""
+from webui.config.settings import config
+from webui.components import (
+    basic_settings,
+    video_settings,
+    audio_settings,
+    subtitle_settings
+)
+from webui.utils import cache, file_utils
+
+__all__ = [
+    'config',
+    'basic_settings',
+    'video_settings',
+    'audio_settings',
+    'subtitle_settings',
+    'cache',
+    'file_utils'
+] 
\ No newline at end of file
diff --git a/webui/components/__init__.py b/webui/components/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aafcd77e9fe73c4fb8719b551faa9f82cca1d87
--- /dev/null
+++ b/webui/components/__init__.py
@@ -0,0 +1,15 @@
+from .basic_settings import render_basic_settings
+from .script_settings import render_script_panel
+from .video_settings import render_video_panel
+from .audio_settings import render_audio_panel
+from .subtitle_settings import render_subtitle_panel
+from .review_settings import render_review_panel
+
+__all__ = [
+    'render_basic_settings',
+    'render_script_panel',
+    'render_video_panel',
+    'render_audio_panel',
+    'render_subtitle_panel',
+    'render_review_panel'
+] 
\ No newline at end of file
diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..a58ca600fa39b4b37cf7b4cf41baa788f275eed9
--- /dev/null
+++ b/webui/components/audio_settings.py
@@ -0,0 +1,212 @@
+import streamlit as st
+import os
+from uuid import uuid4
+from app.config import config
+from app.services import voice
+from app.utils import utils
+from webui.utils.cache import get_songs_cache
+
+
+def render_audio_panel(tr):
+    """Render the audio settings panel: a bordered container holding the TTS and background-music settings."""
+    with st.container(border=True):
+        st.write(tr("Audio Settings"))
+
+        # Render the TTS settings
+        render_tts_settings(tr)
+
+        # Render the background music settings
+        render_bgm_settings(tr)
+
+
+def render_tts_settings(tr):
+    """Render the TTS (text-to-speech) settings: voice selector, optional Azure V2 inputs, voice parameters and preview."""
+    # Fetch the voices available for the supported locales
+    support_locales = ["zh-CN", "en-US"]
+    voices = voice.get_all_azure_voices(filter_locals=support_locales)
+
+    # Map each voice name to a friendlier display name (translated gender, "Neural" removed)
+    friendly_names = {
+        v: v.replace("Female", tr("Female"))
+        .replace("Male", tr("Male"))
+        .replace("Neural", "")
+        for v in voices
+    }
+
+    # Look up the previously saved voice; index 0 is the default selection
+    saved_voice_name = config.ui.get("voice_name", "")
+    saved_voice_name_index = 0
+
+    if saved_voice_name in friendly_names:
+        saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
+    else:
+        # No saved voice: pick the first non-V2 voice matching the UI language. NOTE(review): index is taken over `voices`, options over `friendly_names` — they agree only if `voices` has no duplicates; confirm.
+        for i, v in enumerate(voices):
+            if (v.lower().startswith(st.session_state["ui_language"].lower())
+                    and "V2" not in v):
+                saved_voice_name_index = i
+                break
+
+    # Voice selection dropdown (shows the friendly display names)
+    selected_friendly_name = st.selectbox(
+        tr("Speech Synthesis"),
+        options=list(friendly_names.values()),
+        index=saved_voice_name_index,
+    )
+
+    # Map the selected display name back to the actual voice name
+    voice_name = list(friendly_names.keys())[
+        list(friendly_names.values()).index(selected_friendly_name)
+    ]
+
+    # Persist the selection in the UI config
+    config.ui["voice_name"] = voice_name
+
+    # Azure V2 voices: also render the speech region/key inputs
+    if voice.is_azure_v2_voice(voice_name):
+        render_azure_v2_settings(tr)
+
+    # Voice parameter controls (volume, rate, pitch)
+    render_voice_parameters(tr)
+
+    # Preview button
+    render_voice_preview(tr, voice_name)
+
+
+def render_azure_v2_settings(tr):
+    """Render the Azure V2 speech region/key inputs and write the entered values back to ``config.azure``."""
+    saved_azure_speech_region = config.azure.get("speech_region", "")
+    saved_azure_speech_key = config.azure.get("speech_key", "")
+
+    azure_speech_region = st.text_input(
+        tr("Speech Region"),
+        value=saved_azure_speech_region
+    )
+    azure_speech_key = st.text_input(
+        tr("Speech Key"),
+        value=saved_azure_speech_key,
+        type="password"
+    )
+
+    config.azure["speech_region"] = azure_speech_region
+    config.azure["speech_key"] = azure_speech_key
+
+
+def render_voice_parameters(tr):
+    """Render the voice volume, rate and pitch controls and store each chosen value in ``st.session_state``."""
+    # Volume (slider from 0.0 to 1.0, default 1.0)
+    voice_volume = st.slider(
+        tr("Speech Volume"),
+        min_value=0.0,
+        max_value=1.0,
+        value=1.0,
+        step=0.01,
+        help=tr("Adjust the volume of the original audio")
+    )
+    st.session_state['voice_volume'] = voice_volume
+
+
+    # Speech rate (index=2 selects 1.0 by default)
+    voice_rate = st.selectbox(
+        tr("Speech Rate"),
+        options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+        index=2,
+    )
+    st.session_state['voice_rate'] = voice_rate
+
+    # Pitch (index=2 selects 1.0 by default)
+    voice_pitch = st.selectbox(
+        tr("Speech Pitch"),
+        options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+        index=2,
+    )
+    st.session_state['voice_pitch'] = voice_pitch
+
+
+def render_voice_preview(tr, voice_name):
+    """Render the "Play Voice" button and, on click, play a short TTS sample of ``voice_name``.
+
+    A fixed sample sentence is synthesized into a temporary MP3 (retrying once with an English
+    fallback sentence if that fails); the temporary file is removed on every exit path.
+    """
+    if not st.button(tr("Play Voice")):
+        return
+    # Texts tried in order: the regular sample first, then the fallback used when synthesis fails.
+    play_contents = (
+        "感谢关注 NarratoAI，有任何问题或建议，可以关注微信公众号，求助或讨论",
+        "This is a example voice. if you hear this, the voice synthesis failed with the original content.",
+    )
+
+    with st.spinner(tr("Synthesizing Voice")):
+        temp_dir = utils.storage_dir("temp", create=True)
+        audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
+        try:
+            sub_maker = None
+            for play_content in play_contents:
+                sub_maker = voice.tts(
+                    text=play_content,
+                    voice_name=voice_name,
+                    voice_rate=st.session_state.get('voice_rate', 1.0),
+                    voice_pitch=st.session_state.get('voice_pitch', 1.0),
+                    voice_file=audio_file,
+                )
+                if sub_maker:
+                    break
+
+            if sub_maker and os.path.exists(audio_file):
+                st.audio(audio_file, format="audio/mp3")
+        finally:
+            # Clean up even when synthesis failed or raised, so partial files are not leaked.
+            if os.path.exists(audio_file):
+                os.remove(audio_file)
+
+
+def render_bgm_settings(tr):
+    """Render the background-music settings (type, optional custom file, volume) and store them in ``st.session_state``."""
+    # Background music options as (label, type value) pairs
+    bgm_options = [
+        (tr("No Background Music"), ""),
+        (tr("Random Background Music"), "random"),
+        (tr("Custom Background Music"), "custom"),
+    ]
+
+    selected_index = st.selectbox(
+        tr("Background Music"),
+        index=1,
+        options=range(len(bgm_options)),
+        format_func=lambda x: bgm_options[x][0],
+    )
+
+    # Resolve the selected background music type
+    bgm_type = bgm_options[selected_index][1]
+    st.session_state['bgm_type'] = bgm_type
+
+    # Custom background music: the path is stored only when it is non-empty and exists on disk
+    if bgm_type == "custom":
+        custom_bgm_file = st.text_input(tr("Custom Background Music File"))
+        if custom_bgm_file and os.path.exists(custom_bgm_file):
+            st.session_state['bgm_file'] = custom_bgm_file
+
+    # Background music volume (slider from 0.0 to 1.0, default 0.3)
+    bgm_volume = st.slider(
+        tr("Background Music Volume"),
+        min_value=0.0,
+        max_value=1.0,
+        value=0.3,
+        step=0.01,
+        help=tr("Adjust the volume of the original audio")
+    )
+    st.session_state['bgm_volume'] = bgm_volume
+
+
+def get_audio_params():
+    """Return the current audio parameters as a dict, read from ``config.ui`` and ``st.session_state`` with defaults."""
+    return {
+        'voice_name': config.ui.get("voice_name", ""),
+        'voice_volume': st.session_state.get('voice_volume', 1.0),
+        'voice_rate': st.session_state.get('voice_rate', 1.0),
+        'voice_pitch': st.session_state.get('voice_pitch', 1.0),
+        'bgm_type': st.session_state.get('bgm_type', 'random'),
+        'bgm_file': st.session_state.get('bgm_file', ''),
+        'bgm_volume': st.session_state.get('bgm_volume', 0.3),
+    }
diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5f3c625c32b3d2cf758bcaed60ecddd6bde76e7
--- /dev/null
+++ b/webui/components/basic_settings.py
@@ -0,0 +1,383 @@
+import traceback
+
+import streamlit as st
+import os
+from app.config import config
+from app.utils import utils
+from loguru import logger
+
+
+def render_basic_settings(tr):
+    """Render the collapsed "Basic Settings" expander as three columns.
+
+    Args:
+        tr: Translation callable forwarded to every sub-panel.
+
+    Left column: language and proxy settings. Middle column: vision
+    (video-analysis) model settings. Right column: text (script-generation)
+    model settings.
+    """
+    with st.expander(tr("Basic Settings"), expanded=False):
+        language_proxy_col, vision_col, text_col = st.columns(3)
+
+        with language_proxy_col:
+            render_language_settings(tr)
+            render_proxy_settings(tr)
+
+        with vision_col:
+            # Video-analysis model settings
+            render_vision_llm_settings(tr)
+
+        with text_col:
+            # Script-generation model settings
+            render_text_llm_settings(tr)
+
+
+def render_language_settings(tr):
+    """Render the UI language selector and record the chosen locale.
+
+    Args:
+        tr: Translation callable applied to UI labels.
+
+    The available locales are loaded from the ``i18n`` directory located one
+    level above this file's directory. The selected locale code is written to
+    ``st.session_state['ui_language']`` and ``config.ui['language']``.
+    """
+    # NOTE(review): the heading text is "Proxy Settings" although this function
+    # renders the language selector; presumably it titles the whole column that
+    # also holds the proxy widgets -- confirm before renaming.
+    st.subheader(tr("Proxy Settings"))
+
+    system_locale = utils.get_system_locale()
+    i18n_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "i18n")
+    locales = utils.load_locales(i18n_dir)
+
+    # Preselect the language stored in the session, else the system locale.
+    current_code = st.session_state.get('ui_language', system_locale)
+
+    display_languages = []
+    selected_index = 0
+    for i, (code, locale) in enumerate(locales.items()):
+        display_languages.append(f"{code} - {locale.get('Language')}")
+        if code == current_code:
+            selected_index = i
+
+    selected_language = st.selectbox(
+        tr("Language"),
+        options=display_languages,
+        index=selected_index
+    )
+
+    if selected_language:
+        # Entries are formatted as "<code> - <name>"; keep only the code.
+        code = selected_language.split(" - ")[0].strip()
+        st.session_state['ui_language'] = code
+        config.ui['language'] = code
+
+
+def render_proxy_settings(tr):
+    """Render the proxy toggle and URL inputs and mirror them into the environment.
+
+    Args:
+        tr: Translation callable applied to UI labels.
+
+    When the proxy is enabled and both URLs are non-empty, they are stored in
+    ``config.proxy`` and exported as ``HTTP_PROXY`` / ``HTTPS_PROXY``. When it
+    is disabled, both environment variables are removed.
+    """
+    enabled_default = config.proxy.get("enabled", False)
+    saved_http = config.proxy.get("http")
+    saved_https = config.proxy.get("https")
+
+    # The checkbox state is used for this run only; it is not written back to
+    # config.proxy["enabled"].
+    if not st.checkbox(tr("Enable Proxy"), value=enabled_default):
+        # Proxy disabled: drop the environment variables. The URLs saved in
+        # config.proxy are left as they are.
+        os.environ.pop("HTTP_PROXY", None)
+        os.environ.pop("HTTPS_PROXY", None)
+        return
+
+    # The URL inputs are shown only while the proxy is enabled.
+    http_proxy = st.text_input(tr("HTTP_PROXY"), value=saved_http)
+    https_proxy = st.text_input(tr("HTTPs_PROXY"), value=saved_https)
+
+    # Nothing is applied unless both URLs are filled in.
+    if not (http_proxy and https_proxy):
+        return
+
+    config.proxy["http"] = http_proxy
+    config.proxy["https"] = https_proxy
+    os.environ["HTTP_PROXY"] = http_proxy
+    os.environ["HTTPS_PROXY"] = https_proxy
+
+
+def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
+    """Probe the configured vision model with a single request.
+
+    Args:
+        api_key: API key sent to the provider.
+        base_url: Base URL of the service (not used for Gemini).
+        model_name: Name of the model to call.
+        provider: Provider name, compared case-insensitively.
+        tr: Translation callable used to build the result message.
+
+    Returns:
+        tuple[bool, str]: whether the probe succeeded, and a message
+        describing the outcome.
+    """
+    provider_key = provider.lower()
+
+    if provider_key == 'gemini':
+        import google.generativeai as genai
+
+        try:
+            genai.configure(api_key=api_key)
+            gemini_model = genai.GenerativeModel(model_name)
+            gemini_model.generate_content("直接回复我文本'当前网络可用'")
+            return True, tr("gemini model is available")
+        except Exception as e:
+            return False, f"{tr('gemini model is not available')}: {str(e)}"
+
+    if provider_key == 'narratoapi':
+        import requests
+        try:
+            # A GET on the service's /health endpoint serves as the probe.
+            auth_headers = {
+                "Authorization": f"Bearer {api_key}"
+            }
+            health_url = f"{base_url.rstrip('/')}/health"
+            reply = requests.get(health_url, headers=auth_headers, timeout=10)
+
+            if reply.status_code != 200:
+                return False, f"{tr('NarratoAPI is not available')}: HTTP {reply.status_code}"
+            return True, tr("NarratoAPI is available")
+        except Exception as e:
+            return False, f"{tr('NarratoAPI is not available')}: {str(e)}"
+
+    # Every other provider is called through the OpenAI client with a
+    # chat-completion request that contains one image and one text part.
+    from openai import OpenAI
+    try:
+        client = OpenAI(
+            api_key=api_key,
+            base_url=base_url,
+        )
+
+        system_message = {
+            "role": "system",
+            "content": [{"type": "text", "text": "You are a helpful assistant."}],
+        }
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
+                    },
+                },
+                {"type": "text", "text": "回复我网络可用即可"},
+            ],
+        }
+
+        response = client.chat.completions.create(
+            model=model_name,
+            messages=[system_message, user_message],
+        )
+        # A response without any choices counts as a failure.
+        if not (response and response.choices):
+            return False, tr("QwenVL model returned invalid response")
+        return True, tr("QwenVL model is available")
+
+    except Exception as e:
+        return False, f"{tr('QwenVL model is not available')}: {str(e)}"
+
+
+def render_vision_llm_settings(tr):
+    """Render the vision (video-analysis) model settings and store the inputs.
+
+    Args:
+        tr: Translation callable applied to UI labels and help texts.
+
+    The selected provider is written to ``config.app['vision_llm_provider']``
+    and ``st.session_state['vision_llm_providers']``. Each non-empty input
+    (API key, base URL, model name) is stored under
+    ``vision_<provider>_<field>`` in both ``config.app`` and
+    ``st.session_state``.
+    """
+    st.subheader(tr("Vision Model Settings"))
+
+    # Provider selection
+    vision_providers = ['Siliconflow', 'Gemini', 'QwenVL', 'OpenAI']
+    saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower()
+    # Fall back to the first entry when the saved provider is not in the list.
+    saved_provider_index = next(
+        (i for i, name in enumerate(vision_providers) if name.lower() == saved_vision_provider),
+        0,
+    )
+
+    vision_provider = st.selectbox(
+        tr("Vision Model Provider"),
+        options=vision_providers,
+        index=saved_provider_index
+    ).lower()
+    config.app["vision_llm_provider"] = vision_provider
+    st.session_state['vision_llm_providers'] = vision_provider
+
+    # Previously saved values for the selected provider
+    vision_api_key = config.app.get(f"vision_{vision_provider}_api_key", "")
+    vision_base_url = config.app.get(f"vision_{vision_provider}_base_url", "")
+    vision_model_name = config.app.get(f"vision_{vision_provider}_model_name", "")
+
+    st_vision_api_key = st.text_input(tr("Vision API Key"), value=vision_api_key, type="password")
+
+    # Provider-specific defaults and help texts
+    if vision_provider == 'gemini':
+        # The base URL field is shown but disabled for Gemini.
+        st_vision_base_url = st.text_input(
+            tr("Vision Base URL"),
+            value=vision_base_url,
+            disabled=True,
+            help=tr("Gemini API does not require a base URL")
+        )
+        st_vision_model_name = st.text_input(
+            tr("Vision Model Name"),
+            value=vision_model_name or "gemini-2.0-flash-lite",
+            help=tr("Default: gemini-2.0-flash-lite")
+        )
+    elif vision_provider == 'qwenvl':
+        st_vision_base_url = st.text_input(
+            tr("Vision Base URL"),
+            value=vision_base_url,
+            help=tr("Default: https://dashscope.aliyuncs.com/compatible-mode/v1")
+        )
+        st_vision_model_name = st.text_input(
+            tr("Vision Model Name"),
+            value=vision_model_name or "qwen-vl-max-latest",
+            help=tr("Default: qwen-vl-max-latest")
+        )
+    else:
+        st_vision_base_url = st.text_input(tr("Vision Base URL"), value=vision_base_url)
+        st_vision_model_name = st.text_input(tr("Vision Model Name"), value=vision_model_name)
+
+    # Connection test button, placed after the inputs
+    if st.button(tr("Test Connection"), key="test_vision_connection"):
+        with st.spinner(tr("Testing connection...")):
+            success, message = test_vision_model_connection(
+                api_key=st_vision_api_key,
+                base_url=st_vision_base_url,
+                model_name=st_vision_model_name,
+                provider=vision_provider,
+                tr=tr
+            )
+
+            # The message is already translated by the test function, so it
+            # is shown as-is (same handling as the text-model panel).
+            if success:
+                st.success(message)
+            else:
+                st.error(message)
+
+    # Store the inputs; empty fields never overwrite saved values.
+    entered_values = (
+        ("api_key", st_vision_api_key),
+        ("base_url", st_vision_base_url),
+        ("model_name", st_vision_model_name),
+    )
+    for field, value in entered_values:
+        if value:
+            setting_key = f"vision_{vision_provider}_{field}"
+            config.app[setting_key] = value
+            st.session_state[setting_key] = value
+
+
+def test_text_model_connection(api_key, base_url, model_name, provider, tr):
+    """Probe the configured text model with a single request.
+
+    Args:
+        api_key: API key sent to the provider.
+        base_url: Base URL of an OpenAI-compatible service (not used for Gemini).
+        model_name: Name of the model to call.
+        provider: Provider name, compared case-insensitively.
+        tr: Translation callable used to build the result message.
+
+    Returns:
+        tuple[bool, str]: whether the probe succeeded, and a message
+        describing the outcome. Any exception is logged with its traceback
+        and reported as a failed connection rather than raised.
+    """
+    import requests
+
+    try:
+        # Gemini is probed through its own SDK instead of an HTTP request.
+        if provider.lower() == 'gemini':
+            import google.generativeai as genai
+            try:
+                genai.configure(api_key=api_key)
+                model = genai.GenerativeModel(model_name)
+                model.generate_content("直接回复我文本'当前网络可用'")
+                return True, tr("Gemini model is available")
+            except Exception as e:
+                return False, f"{tr('Gemini model is not available')}: {str(e)}"
+
+        # All other providers receive an OpenAI-format chat-completion request.
+        headers = {
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json"
+        }
+        test_url = f"{base_url.rstrip('/')}/chat/completions"
+        test_data = {
+            "model": model_name,
+            "messages": [
+                {"role": "user", "content": "直接回复我文本'当前网络可用'"}
+            ],
+            "stream": False
+        }
+
+        # Without a timeout an unresponsive endpoint would block this call,
+        # and with it the UI spinner, indefinitely.
+        response = requests.post(
+            test_url,
+            headers=headers,
+            json=test_data,
+            timeout=30,
+        )
+        if response.status_code == 200:
+            return True, tr("Text model is available")
+        return False, f"{tr('Text model is not available')}: HTTP {response.status_code}"
+
+    except Exception as e:
+        logger.error(traceback.format_exc())
+        return False, f"{tr('Connection failed')}: {str(e)}"
+
+
+def render_text_llm_settings(tr):
+    """Render the text (script-generation) model settings and store the inputs.
+
+    Args:
+        tr: Translation callable applied to UI labels.
+
+    The selected provider is written to ``config.app['text_llm_provider']``.
+    Each non-empty input (API key, base URL, model name) is stored in
+    ``config.app`` under ``text_<provider>_<field>``.
+    """
+    st.subheader(tr("Text Generation Model Settings"))
+
+    # Provider selection
+    text_providers = ['OpenAI', 'Siliconflow', 'DeepSeek', 'Gemini', 'Qwen', 'Moonshot']
+    saved_text_provider = config.app.get("text_llm_provider", "OpenAI").lower()
+    # Position of the saved provider; the first entry when it is not listed.
+    saved_provider_index = next(
+        (pos for pos, name in enumerate(text_providers) if name.lower() == saved_text_provider),
+        0,
+    )
+
+    text_provider = st.selectbox(
+        tr("Text Model Provider"),
+        options=text_providers,
+        index=saved_provider_index
+    ).lower()
+    config.app["text_llm_provider"] = text_provider
+
+    # Input fields, pre-filled with the values saved for this provider
+    st_text_api_key = st.text_input(
+        tr("Text API Key"),
+        value=config.app.get(f"text_{text_provider}_api_key"),
+        type="password",
+    )
+    st_text_base_url = st.text_input(
+        tr("Text Base URL"),
+        value=config.app.get(f"text_{text_provider}_base_url"),
+    )
+    st_text_model_name = st.text_input(
+        tr("Text Model Name"),
+        value=config.app.get(f"text_{text_provider}_model_name"),
+    )
+
+    # Connection test button
+    if st.button(tr("Test Connection"), key="test_text_connection"):
+        with st.spinner(tr("Testing connection...")):
+            success, message = test_text_model_connection(
+                api_key=st_text_api_key,
+                base_url=st_text_base_url,
+                model_name=st_text_model_name,
+                provider=text_provider,
+                tr=tr
+            )
+
+            show_result = st.success if success else st.error
+            show_result(message)
+
+    # Store the inputs; empty fields never overwrite saved values.
+    entered_values = (
+        ("api_key", st_text_api_key),
+        ("base_url", st_text_base_url),
+        ("model_name", st_text_model_name),
+    )
+    for field, value in entered_values:
+        if value:
+            config.app[f"text_{text_provider}_{field}"] = value
+
+    # # Cloudflare 特殊配置
+    # if text_provider == 'cloudflare':
+    #     st_account_id = st.text_input(
+    #         tr("Account ID"),
+    #         value=config.app.get(f"text_{text_provider}_account_id", "")
+    #     )
+    #     if st_account_id:
+    #         config.app[f"text_{text_provider}_account_id"] = st_account_id
diff --git a/webui/components/merge_settings.py b/webui/components/merge_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe35f7aa6fa25a950dcd3597da6d6afd8e173836
--- /dev/null
+++ b/webui/components/merge_settings.py
@@ -0,0 +1,296 @@
+import os
+import time
+import streamlit as st
+from loguru import logger
+from typing import List, Dict
+from dataclasses import dataclass
+from streamlit.runtime.uploaded_file_manager import UploadedFile
+
+from webui.utils.merge_video import merge_videos_and_subtitles
+from app.utils.utils import video_dir, srt_dir
+
+# Temporary directory that holds the files uploaded for merging
+TEMP_MERGE_DIR = os.path.join("storage", "temp", "merge")
+
+# Make sure the temporary directory exists (runs when this module is imported)
+os.makedirs(TEMP_MERGE_DIR, exist_ok=True)
+
+
+@dataclass
+class VideoSubtitlePair:
+    """An uploaded video and its subtitle, matched by a shared base name."""
+    # Uploaded video object, or None until a video with this base name is seen
+    video_file: UploadedFile | None
+    # Path of the subtitle file in the temporary directory, or None when no subtitle was matched
+    subtitle_file: str | None
+    # File name without its extension; used as the pairing key
+    base_name: str
+    # Initial position of the pair, assigned in upload order
+    order: int = 0
+
+
+def save_uploaded_file(uploaded_file: UploadedFile, target_dir: str) -> str:
+    """Write an uploaded file into ``target_dir`` and return the path written.
+
+    Args:
+        uploaded_file: Uploaded file object; its ``name`` and ``getvalue()``
+            are used.
+        target_dir: Directory the file is written to.
+
+    Returns:
+        str: Path of the saved file inside ``target_dir``.
+    """
+    # The name is supplied by the client, so keep only its final component;
+    # otherwise directory parts in the name could place the file outside
+    # target_dir.
+    safe_name = os.path.basename(uploaded_file.name)
+    file_path = os.path.join(target_dir, safe_name)
+    # Remove an existing file of the same name before writing the new one.
+    if os.path.exists(file_path):
+        os.remove(file_path)
+    with open(file_path, "wb") as f:
+        f.write(uploaded_file.getvalue())
+    return file_path
+
+
+def clean_temp_dir():
+    """Delete every regular file directly inside ``TEMP_MERGE_DIR``.
+
+    Sub-directories are left alone. A failure on one file is logged and the
+    remaining files are still processed.
+    """
+    if not os.path.exists(TEMP_MERGE_DIR):
+        return
+
+    for entry_name in os.listdir(TEMP_MERGE_DIR):
+        candidate = os.path.join(TEMP_MERGE_DIR, entry_name)
+        try:
+            if not os.path.isfile(candidate):
+                continue
+            os.unlink(candidate)
+        except Exception as e:
+            logger.error(f"清理临时文件失败: {str(e)}")
+
+
+def group_files(files: List[UploadedFile]) -> Dict[str, VideoSubtitlePair]:
+    """Save the uploaded files and pair videos with subtitles by base name.
+
+    Args:
+        files: Uploaded files; only ``.mp4`` and ``.srt`` (case-insensitive
+            extension) are handled, anything else is ignored.
+
+    Returns:
+        dict: Base name -> ``VideoSubtitlePair``. A pair is created for every
+        video; a subtitle is attached only when a video with the same base
+        name was uploaded. Every ``.mp4`` and ``.srt`` file is written to
+        ``TEMP_MERGE_DIR`` regardless of pairing.
+    """
+    pairs = {}
+    subtitles = []
+
+    # Videos are handled first so that every pair exists before subtitles are
+    # attached; subtitles are collected and processed afterwards.
+    for file in files:
+        base_name, ext = os.path.splitext(file.name)
+        ext = ext.lower()
+
+        if ext == ".mp4":
+            if base_name not in pairs:
+                # Pairs are numbered in the order their videos appear.
+                pairs[base_name] = VideoSubtitlePair(None, None, base_name, len(pairs))
+            pairs[base_name].video_file = file
+            save_uploaded_file(file, TEMP_MERGE_DIR)
+        elif ext == ".srt":
+            subtitles.append((base_name, file))
+
+    for base_name, file in subtitles:
+        # The subtitle is saved even when there is no matching video.
+        saved_path = save_uploaded_file(file, TEMP_MERGE_DIR)
+        if base_name in pairs:
+            # Record the path that was actually written rather than rebuilding
+            # it with a hard-coded lowercase ".srt" extension.
+            pairs[base_name].subtitle_file = saved_path
+
+    return pairs
+
+
+def render_merge_settings(tr):
+    """Render the "Video Subtitle Merge" expander: upload, preview, order and merge.
+
+    Args:
+        tr: Translation callable applied to UI labels and messages.
+
+    ``st.session_state.file_orders`` (base name -> order value) and
+    ``st.session_state.needs_reorder`` keep the user-chosen ordering across
+    reruns. After a successful merge the temporary directory is emptied and
+    the merged video is previewed together with both output paths.
+    """
+    with st.expander(tr("Video Subtitle Merge"), expanded=False):
+        # Upload area
+        uploaded_files = st.file_uploader(
+            tr("Upload Video and Subtitle Files"),
+            type=["mp4", "srt"],
+            accept_multiple_files=True,
+            key="merge_files"
+        )
+
+        if not uploaded_files:
+            return
+
+        # Saves the uploads into TEMP_MERGE_DIR and pairs them by base name.
+        all_pairs = group_files(uploaded_files)
+        if not all_pairs:
+            st.warning(tr("No Files Found"))
+            return
+
+        st.write(tr("All Uploaded Files"))
+
+        # Initialise the ordering information on first use.
+        if 'file_orders' not in st.session_state:
+            st.session_state.file_orders = {
+                name: pair.order for name, pair in all_pairs.items()
+            }
+            st.session_state.needs_reorder = False
+
+        # Give newly uploaded files an order value.
+        for name, pair in all_pairs.items():
+            if name not in st.session_state.file_orders:
+                st.session_state.file_orders[name] = pair.order
+
+        # Drop order values of files that are no longer uploaded.
+        st.session_state.file_orders = {
+            k: v for k, v in st.session_state.file_orders.items()
+            if k in all_pairs
+        }
+
+        sorted_pairs = sorted(
+            all_pairs.items(),
+            key=lambda x: st.session_state.file_orders[x[0]]
+        )
+
+        # Five previews per row; round the row count up.
+        num_pairs = len(sorted_pairs)
+        num_rows = (num_pairs + 4) // 5
+
+        for row in range(num_rows):
+            cols = st.columns(5)
+
+            for col_idx in range(5):
+                pair_idx = row * 5 + col_idx
+                if pair_idx < num_pairs:
+                    base_name, pair = sorted_pairs[pair_idx]
+                    with cols[col_idx]:
+                        st.caption(base_name)
+
+                        # Video preview, if the file exists in the temp dir
+                        video_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4")
+                        if os.path.exists(video_path):
+                            st.video(video_path)
+                        else:
+                            st.warning(tr("Missing Video"))
+
+                        # Subtitle preview, if the file exists in the temp dir
+                        subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt")
+                        if os.path.exists(subtitle_path):
+                            with open(subtitle_path, 'r', encoding='utf-8') as f:
+                                subtitle_content = f.read()
+                                st.markdown(tr("Subtitle Preview"))
+                                st.text_area(
+                                    "Subtitle Content",
+                                    value=subtitle_content,
+                                    height=100,  # reduced height to fit the 5-column layout
+                                    label_visibility="collapsed",
+                                    key=f"subtitle_preview_{base_name}"
+                                )
+                        else:
+                            st.warning(tr("Missing Subtitle"))
+                            # NOTE: a "one-click transcribe" button for videos
+                            # without a subtitle used to be sketched here as
+                            # commented-out code; it is not implemented.
+
+                        # Order input for this pair
+                        order = st.number_input(
+                            tr("Order"),
+                            min_value=0,
+                            value=st.session_state.file_orders[base_name],
+                            key=f"order_{base_name}",
+                            on_change=lambda: setattr(st.session_state, 'needs_reorder', True)
+                        )
+                        if order != st.session_state.file_orders[base_name]:
+                            st.session_state.file_orders[base_name] = order
+                            st.session_state.needs_reorder = True
+
+        # Rerun the script so the grid is redrawn in the new order.
+        if st.session_state.needs_reorder:
+            st.session_state.needs_reorder = False
+            st.rerun()
+
+        # Pairs for which both the video and the subtitle file exist on disk
+        complete_pairs = {
+            k: v for k, v in all_pairs.items()
+            if os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.mp4")) and
+            os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.srt"))
+        }
+
+        # Merge button and result display
+        cols = st.columns([1, 2, 1])
+        with cols[0]:
+            st.write(f"{tr('Mergeable Files')}: {len(complete_pairs)}")
+
+        merge_videos_result = None
+
+        with cols[1]:
+            if st.button(tr("Merge All Files"), type="primary", use_container_width=True):
+                try:
+                    # Base names of the complete pairs in the user-chosen order
+                    ordered_names = sorted(
+                        complete_pairs,
+                        key=lambda name: st.session_state.file_orders[name]
+                    )
+
+                    video_paths = []
+                    subtitle_paths = []
+                    for base_name in ordered_names:
+                        video_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4"))
+                        subtitle_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt"))
+
+                    # Take the timestamp once so the video and the subtitle
+                    # always share the same suffix, even when the clock ticks
+                    # over between building the two names.
+                    stamp = time.strftime('%M%S')
+                    output_video = os.path.join(video_dir(), f"merged_video_{stamp}.mp4")
+                    output_subtitle = os.path.join(srt_dir(), f"merged_subtitle_{stamp}.srt")
+
+                    with st.spinner(tr("Merging files...")):
+                        merge_videos_and_subtitles(
+                            video_paths,
+                            subtitle_paths,
+                            output_video,
+                            output_subtitle
+                        )
+
+                        success = True
+                        error_msg = ""
+
+                        # Verify that both output files were produced.
+                        if not os.path.exists(output_video):
+                            success = False
+                            error_msg += tr("Failed to generate merged video. ")
+                        if not os.path.exists(output_subtitle):
+                            success = False
+                            error_msg += tr("Failed to generate merged subtitle. ")
+
+                        if success:
+                            st.success(tr("Merge completed!"))
+                            merge_videos_result = (output_video, output_subtitle)
+                            # The uploaded inputs are no longer needed.
+                            clean_temp_dir()
+                        else:
+                            st.error(error_msg)
+
+                except Exception as e:
+                    error_message = str(e)
+                    if "moviepy" in error_message.lower():
+                        st.error(tr("Error processing video files. Please check if the videos are valid MP4 files."))
+                    else:
+                        st.error(f"{tr('Error during merge')}: {error_message}")
+
+        # Preview of the merge result, shown below the merge button
+        if merge_videos_result:
+            st.markdown(f"<h3 style='text-align: center'>{tr('Merge Result Preview')}</h3>", unsafe_allow_html=True)
+            # The middle column of a 1:2:1 layout centres the video.
+            col1, col2, col3 = st.columns([1,2,1])
+            with col2:
+                st.video(merge_videos_result[0])
+                st.code(f"{tr('Video Path')}: {merge_videos_result[0]}")
+                st.code(f"{tr('Subtitle Path')}: {merge_videos_result[1]}")
diff --git a/webui/components/review_settings.py b/webui/components/review_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f3bce479c511dd49386f8ae68f2c797b63ad0d
--- /dev/null
+++ b/webui/components/review_settings.py
@@ -0,0 +1,88 @@
+import streamlit as st
+import os
+from loguru import logger
+
+
+def render_review_panel(tr):
+    """Render the "Video Check" expander as a three-column grid of clip items.
+
+    Args:
+        tr: Translation callable applied to UI labels.
+
+    Reads ``video_clip_json`` (list of script entries) and ``subclip_videos``
+    (mapping used to look up each entry's video path) from
+    ``st.session_state``.
+    """
+    with st.expander(tr("Video Check"), expanded=False):
+        # ``.get`` with a default never raises KeyError, so no exception
+        # handling is needed; ``or`` additionally covers a key that is present
+        # but holds None.
+        video_list = st.session_state.get('video_clip_json', []) or []
+        subclip_videos = st.session_state.get('subclip_videos', {}) or {}
+
+        cols_per_row = 3
+        cols = None
+        for index in range(len(video_list)):
+            column_pos = index % cols_per_row
+            # Start a new row of columns every ``cols_per_row`` items.
+            if column_pos == 0:
+                cols = st.columns(cols_per_row)
+            with cols[column_pos]:
+                render_video_item(tr, video_list, subclip_videos, index)
+
+
+def render_video_item(tr, video_list, subclip_videos, index):
+    """Render one clip: timestamp, player, description, narration and clip mode.
+
+    Args:
+        tr: Translation callable applied to UI labels.
+        video_list: List of script entries (dicts); the entry at ``index`` is
+            shown and updated in place when the user edits it.
+        subclip_videos: Mapping from an entry's ``_id`` value to a video path.
+        index: Position of the entry in ``video_list``; also used to build
+            unique widget keys.
+
+    Edits to the narration or clip mode are written back into the entry and
+    ``video_list`` is stored again under ``st.session_state['video_clip_json']``.
+    """
+    entry = video_list[index]
+
+    # Timestamp (read-only); it doubles as the key into ``subclip_videos``.
+    clip_id = entry.get('_id', '')
+    st.text_area(
+        tr("Timestamp"),
+        value=clip_id,
+        height=70,
+        disabled=True,
+        key=f"timestamp_{index}"
+    )
+
+    # Video player
+    clip_path = subclip_videos.get(clip_id)
+    if not (clip_path and os.path.exists(clip_path)):
+        st.warning(tr("视频文件未找到"))
+    else:
+        try:
+            st.video(clip_path)
+        except Exception as e:
+            logger.error(f"加载视频失败 {clip_path}: {e}")
+            st.error(f"无法加载视频: {os.path.basename(clip_path)}")
+
+    # Picture description (read-only)
+    st.text_area(
+        tr("Picture Description"),
+        value=entry.get('picture', ''),
+        height=150,
+        disabled=True,
+        key=f"picture_{index}"
+    )
+
+    # Narration text (editable); store it when it differs from the entry.
+    stored_narration = entry.get('narration', '')
+    edited_narration = st.text_area(
+        tr("Narration"),
+        value=stored_narration,
+        height=150,
+        key=f"narration_{index}"
+    )
+    if edited_narration != stored_narration:
+        entry['narration'] = edited_narration
+        st.session_state['video_clip_json'] = video_list
+
+    # Clip mode (editable); store it when it differs from the entry.
+    stored_mode = entry.get('OST', 0)
+    chosen_mode = st.selectbox(
+        tr("Clip Mode"),
+        options=range(0, 3),
+        index=stored_mode,
+        key=f"ost_{index}",
+        help=tr("0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio")
+    )
+    if chosen_mode != stored_mode:
+        entry['OST'] = chosen_mode
+        st.session_state['video_clip_json'] = video_list
diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..b452d08c6cfbf2fc886ef6a0bfb8020ce12a34d2
--- /dev/null
+++ b/webui/components/script_settings.py
@@ -0,0 +1,447 @@
+import os
+import glob
+import json
+import time
+import traceback
+import streamlit as st
+from loguru import logger
+
+from app.config import config
+from app.models.schema import VideoClipParams
+from app.utils import utils, check_script
+from webui.tools.generate_script_docu import generate_script_docu
+from webui.tools.generate_script_short import generate_script_short
+from webui.tools.generate_short_summary import generate_script_short_sunmmary
+
+
+def render_script_panel(tr):
+    """Render the script configuration panel."""
+    with st.container(border=True):
+        st.write(tr("Video Script Configuration"))
+        params = VideoClipParams()
+
+        # Render the script file selector
+        render_script_file(tr, params)
+
+        # Render the video file selector
+        render_video_file(tr, params)
+
+        # Currently selected script type (or script file path)
+        script_path = st.session_state.get('video_clip_json_path', '')
+
+        # Show a different layout depending on the script type
+        if script_path == "auto":
+            # Picture commentary
+            render_video_details(tr)
+        elif script_path == "short":
+            # Short-drama mashup
+            render_short_generate_options(tr)
+        elif script_path == "summary":
+            # Short-drama commentary
+            short_drama_summary(tr)
+        else:
+            # Nothing extra by default
+            pass
+
+        # Render the script action buttons
+        render_script_buttons(tr, params)
+
+
+def render_script_file(tr, params):
+    """Render the script file selector and handle script uploads."""
+    script_list = [
+        (tr("None"), ""),
+        (tr("Auto Generate"), "auto"),
+        (tr("Short Generate"), "short"),
+        (tr("Short Drama Summary"), "summary"),
+        (tr("Upload Script"), "upload_script")
+    ]
+
+    # Collect the existing script files
+    suffix = "*.json"
+    script_dir = utils.script_dir()
+    files = glob.glob(os.path.join(script_dir, suffix))
+    file_list = []
+
+    for file in files:
+        file_list.append({
+            "name": os.path.basename(file),
+            "file": file,
+            "ctime": os.path.getctime(file)
+        })
+
+    file_list.sort(key=lambda x: x["ctime"], reverse=True)  # newest first
+    for file in file_list:
+        display_name = file['file'].replace(config.root_dir, "")
+        script_list.append((display_name, file['file']))
+
+    # Find the index of the previously saved script path in the list
+    saved_script_path = st.session_state.get('video_clip_json_path', '')
+    selected_index = 0
+    for i, (_, path) in enumerate(script_list):
+        if path == saved_script_path:
+            selected_index = i
+            break
+
+    selected_script_index = st.selectbox(
+        tr("Script Files"),
+        index=selected_index,
+        options=range(len(script_list)),
+        format_func=lambda x: script_list[x][0]
+    )
+
+    script_path = script_list[selected_script_index][1]
+    st.session_state['video_clip_json_path'] = script_path
+    params.video_clip_json_path = script_path
+
+    # Handle a script upload
+    if script_path == "upload_script":
+        uploaded_file = st.file_uploader(
+            tr("Upload Script File"),
+            type=["json"],
+            accept_multiple_files=False,
+        )
+
+        if uploaded_file is not None:
+            try:
+                # Read the uploaded content and validate that it parses as JSON
+                script_content = uploaded_file.read().decode('utf-8')
+                json_data = json.loads(script_content)
+
+                # Save into the script directory
+                script_file_path = os.path.join(script_dir, uploaded_file.name)
+                file_name, file_extension = os.path.splitext(uploaded_file.name)
+
+                # If the file already exists, append a timestamp to the name
+                if os.path.exists(script_file_path):
+                    timestamp = time.strftime("%Y%m%d%H%M%S")
+                    file_name_with_timestamp = f"{file_name}_{timestamp}"
+                    script_file_path = os.path.join(script_dir, file_name_with_timestamp + file_extension)
+
+                # Write the file
+                with open(script_file_path, "w", encoding='utf-8') as f:
+                    json.dump(json_data, f, ensure_ascii=False, indent=2)
+
+                # Update the state, then rerun so the selector lists the new file
+                st.success(tr("Script Uploaded Successfully"))
+                st.session_state['video_clip_json_path'] = script_file_path
+                params.video_clip_json_path = script_file_path
+                time.sleep(1)
+                st.rerun()
+
+            except json.JSONDecodeError:
+                st.error(tr("Invalid JSON format"))
+            except Exception as e:
+                st.error(f"{tr('Upload failed')}: {str(e)}")
+
+
+def render_video_file(tr, params):
+    """Render the source-video selector and handle local uploads.
+
+    The choice goes to st.session_state['video_origin_path'] and params.video_origin_path.
+    """
+    video_list = [(tr("None"), ""), (tr("Upload Local Files"), "upload_local")]
+    video_dir = utils.video_dir()
+    # List existing videos; the patterns mirror the uploader's accepted types
+    for suffix in ["*.mp4", "*.mov", "*.avi", "*.flv", "*.mkv"]:
+        for file in glob.glob(os.path.join(video_dir, suffix)):
+            video_list.append((file.replace(config.root_dir, ""), file))
+
+    # Restore the previous choice so a freshly uploaded file stays selected after the rerun
+    paths = [path for _, path in video_list]
+    saved_video_path = st.session_state.get('video_origin_path', '')
+    selected_index = paths.index(saved_video_path) if saved_video_path in paths else 0
+    selected_video_index = st.selectbox(
+        tr("Video File"),
+        index=selected_index,
+        options=range(len(video_list)),
+        format_func=lambda x: video_list[x][0]
+    )
+    video_path = video_list[selected_video_index][1]
+    st.session_state['video_origin_path'] = video_path
+    params.video_origin_path = video_path
+
+    if video_path == "upload_local":
+        uploaded_file = st.file_uploader(
+            tr("Upload Local Files"),
+            type=["mp4", "mov", "avi", "flv", "mkv"],
+            accept_multiple_files=False,
+        )
+        if uploaded_file is not None:
+            file_name, file_extension = os.path.splitext(uploaded_file.name)
+            video_file_path = os.path.join(video_dir, uploaded_file.name)
+            if os.path.exists(video_file_path):  # never overwrite: add a timestamp suffix
+                stamped = f"{file_name}_{time.strftime('%Y%m%d%H%M%S')}{file_extension}"
+                video_file_path = os.path.join(video_dir, stamped)
+            with open(video_file_path, "wb") as f:
+                f.write(uploaded_file.read())
+            st.success(tr("File Uploaded Successfully"))
+            st.session_state['video_origin_path'] = video_file_path
+            params.video_origin_path = video_file_path
+            time.sleep(1)
+            st.rerun()
+
+
+def render_short_generate_options(tr):
+    """
+    Render the extra options of the Short Generate mode.
+    Shows the short-drama inputs followed by a selector for the number of custom clips.
+    """
+    short_drama_summary(tr)
+    # Selector for the number of custom clips
+    custom_clips = st.number_input(
+        tr("自定义片段"),
+        min_value=1,
+        max_value=20,
+        value=st.session_state.get('custom_clips', 5),
+        help=tr("设置需要生成的短视频片段数量"),
+        key="custom_clips_input"
+    )
+    st.session_state['custom_clips'] = custom_clips
+
+
+def render_video_details(tr):
+    """Picture-commentary mode: render the video theme, generation prompt and frame-extraction inputs."""
+    video_theme = st.text_input(tr("Video Theme"))
+    custom_prompt = st.text_area(
+        tr("Generation Prompt"),
+        value=st.session_state.get('video_plot', ''),  # NOTE(review): seeded from 'video_plot' but stored as 'custom_prompt' below — confirm
+        help=tr("Custom prompt for LLM, leave empty to use default prompt"),
+        height=180
+    )
+    # Frame-extraction inputs, laid out in two columns
+    input_cols = st.columns(2)
+
+    with input_cols[0]:
+        st.number_input(
+            tr("Frame Interval (seconds)"),
+            min_value=0,
+            value=st.session_state.get('frame_interval_input', config.frames.get('frame_interval_input', 3)),
+            help=tr("Frame Interval (seconds) (More keyframes consume more tokens)"),
+            key="frame_interval_input"
+        )
+
+    with input_cols[1]:
+        st.number_input(
+            tr("Batch Size"),
+            min_value=0,
+            value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 10)),
+            help=tr("Batch Size (More keyframes consume more tokens)"),
+            key="vision_batch_size"
+        )
+    st.session_state['video_theme'] = video_theme
+    st.session_state['custom_prompt'] = custom_prompt
+    return video_theme, custom_prompt
+
+
+def short_drama_summary(tr):
+    """Short-drama commentary: render the subtitle upload, drama-name input and temperature slider."""
+    # Track whether the uploaded subtitle file has already been processed
+    if 'subtitle_file_processed' not in st.session_state:
+        st.session_state['subtitle_file_processed'] = False
+    
+    subtitle_file = st.file_uploader(
+        tr("上传字幕文件"),
+        type=["srt"],
+        accept_multiple_files=False,
+        key="subtitle_file_uploader"  # explicit unique key
+    )
+    
+    # Show the subtitle file that is currently stored, with a button to clear it
+    if 'subtitle_path' in st.session_state and st.session_state['subtitle_path']:
+        st.info(f"已上传字幕: {os.path.basename(st.session_state['subtitle_path'])}")
+        if st.button(tr("清除已上传字幕")):
+            st.session_state['subtitle_path'] = None
+            st.session_state['subtitle_file_processed'] = False
+            st.rerun()
+    
+    # Only process when a file is present and has not been processed yet
+    # NOTE(review): a different file uploaded later looks like it is ignored until the flag is reset — confirm
+    if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
+        try:
+            # Read the uploaded SRT content
+            script_content = subtitle_file.read().decode('utf-8')
+
+            # Save into the subtitle directory
+            script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
+            file_name, file_extension = os.path.splitext(subtitle_file.name)
+
+            # If the file already exists, append a timestamp to the name
+            if os.path.exists(script_file_path):
+                timestamp = time.strftime("%Y%m%d%H%M%S")
+                file_name_with_timestamp = f"{file_name}_{timestamp}"
+                script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)
+
+            # Write the SRT content as-is, without any JSON conversion
+            with open(script_file_path, "w", encoding='utf-8') as f:
+                f.write(script_content)
+
+            # Update the state
+            st.success(tr("字幕上传成功"))
+            st.session_state['subtitle_path'] = script_file_path
+            st.session_state['subtitle_file_processed'] = True  # mark as processed
+            
+            # Avoid a rerun here; updating the state is enough
+            # st.rerun()
+            
+        except Exception as e:
+            st.error(f"{tr('Upload failed')}: {str(e)}")
+
+    # Drama name input
+    video_theme = st.text_input(tr("短剧名称"))
+    st.session_state['video_theme'] = video_theme
+    # Temperature slider (range 0.0-2.0, default 0.7)
+    temperature = st.slider("temperature", 0.0, 2.0, 0.7)
+    st.session_state['temperature'] = temperature
+    return video_theme
+
+
+def render_script_buttons(tr, params):
+    """Render the generate/load button, the script editor and the check/save/crop buttons.
+
+    The main button is enabled only for a generation mode or a *.json script path, so
+    placeholder selections such as "upload_script" are never handed to load_script.
+    """
+    script_path = st.session_state.get('video_clip_json_path', '')
+    is_script_file = script_path.endswith("json")
+
+    if script_path == "auto":
+        button_name = tr("Generate Video Script")
+    elif script_path == "short":
+        button_name = tr("Generate Short Video Script")
+    elif script_path == "summary":
+        button_name = tr("生成短剧解说脚本")
+    elif is_script_file:
+        button_name = tr("Load Video Script")
+    else:
+        button_name = tr("Please Select Script File")
+    actionable = is_script_file or script_path in ("auto", "short", "summary")
+    if st.button(button_name, key="script_action", disabled=not actionable):
+        if script_path == "auto":
+            # Documentary-style script generation (video without subtitles or voice-over)
+            generate_script_docu(params)
+        elif script_path == "short":
+            # Short-drama mashup script generation
+            generate_script_short(tr, params, st.session_state.get('custom_clips'))
+        elif script_path == "summary":
+            # Short-drama commentary script generation
+            subtitle_path = st.session_state.get('subtitle_path')
+            video_theme = st.session_state.get('video_theme')
+            temperature = st.session_state.get('temperature')
+            generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature)
+        else:
+            load_script(tr, script_path)
+
+    # Script editing area
+    video_clip_json_details = st.text_area(
+        tr("Video Script"),
+        value=json.dumps(st.session_state.get('video_clip_json', []), indent=2, ensure_ascii=False),
+        height=180
+    )
+
+    # Row of action buttons; cropping stays disabled until the format check has passed
+    button_cols = st.columns(3)
+    with button_cols[0]:
+        if st.button(tr("Check Format"), key="check_format", use_container_width=True):
+            check_script_format(tr, video_clip_json_details)
+    with button_cols[1]:
+        if st.button(tr("Save Script"), key="save_script", use_container_width=True):
+            save_script(tr, video_clip_json_details)
+    with button_cols[2]:
+        script_valid = st.session_state.get('script_format_valid', False)
+        if st.button(tr("Crop Video"), key="crop_video", disabled=not script_valid, use_container_width=True):
+            crop_video(tr, params)
+
+
+def check_script_format(tr, script_content):
+    """Check the script format and record the result in st.session_state['script_format_valid']."""
+    try:
+        result = check_script.check_format(script_content)
+        if result.get('success'):
+            st.success(tr("Script format check passed"))
+            st.session_state['script_format_valid'] = True
+        else:
+            st.error(f"{tr('Script format check failed')}: {result.get('message')}")
+            st.session_state['script_format_valid'] = False
+    except Exception as e:
+        st.error(f"{tr('Script format check error')}: {str(e)}")
+        st.session_state['script_format_valid'] = False
+
+
+def load_script(tr, script_path):
+    """Load a script file into st.session_state['video_clip_json'] and rerun the page."""
+    try:
+        with open(script_path, 'r', encoding='utf-8') as f:
+            script = f.read()
+            script = utils.clean_model_output(script)
+            st.session_state['video_clip_json'] = json.loads(script)
+            st.success(tr("Script loaded successfully"))
+            st.rerun()
+    except Exception as e:
+        logger.error(f"加载脚本文件时发生错误\n{traceback.format_exc()}")
+        st.error(f"{tr('Failed to load script')}: {str(e)}")
+
+
+def save_script(tr, video_clip_json_details):
+    """Save the edited video script as a timestamp-named JSON file in the script directory."""
+    if not video_clip_json_details:
+        st.error(tr("请输入视频脚本"))
+        st.stop()
+
+    with st.spinner(tr("Save Script")):
+        script_dir = utils.script_dir()
+        timestamp = time.strftime("%Y-%m%d-%H%M%S")
+        save_path = os.path.join(script_dir, f"{timestamp}.json")
+
+        try:
+            data = json.loads(video_clip_json_details)  # parsed before the file is opened
+            with open(save_path, 'w', encoding='utf-8') as file:
+                json.dump(data, file, ensure_ascii=False, indent=4)
+                st.session_state['video_clip_json'] = data
+                st.session_state['video_clip_json_path'] = save_path
+
+                # Update the configuration
+                config.app["video_clip_json_path"] = save_path
+
+                # Show the success message
+                st.success(tr("Script saved successfully"))
+
+                # Force a page reload so the selector is refreshed
+                time.sleep(0.5)  # give the user a moment to see the success message
+                st.rerun()
+
+        except Exception as err:
+            st.error(f"{tr('Failed to save script')}: {str(err)}")
+            st.stop()
+
+
+def crop_video(tr, params):
+    """Cut the video via utils.cut_video while showing a progress bar and status text."""
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    def update_progress(progress):  # progress callback handed to utils.cut_video
+        progress_bar.progress(progress)
+        status_text.text(f"剪辑进度: {progress}%")
+
+    try:
+        utils.cut_video(params, update_progress)
+        time.sleep(0.5)
+        progress_bar.progress(100)
+        st.success("视频剪辑成功完成！")
+    except Exception as e:
+        st.error(f"剪辑过程中发生错误: {str(e)}")
+    finally:
+        time.sleep(1)  # pause before the progress widgets are removed
+        progress_bar.empty()
+        status_text.empty()
+
+
+def get_script_params():
+    """Return the script-related values from the session state (empty string when unset)."""
+    return {
+        'video_language': st.session_state.get('video_language', ''),
+        'video_clip_json_path': st.session_state.get('video_clip_json_path', ''),
+        'video_origin_path': st.session_state.get('video_origin_path', ''),
+        'video_name': st.session_state.get('video_name', ''),
+        'video_plot': st.session_state.get('video_plot', '')
+    }
diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee27985fa6f192f35d163dddb712161928d2acb1
--- /dev/null
+++ b/webui/components/subtitle_settings.py
@@ -0,0 +1,134 @@
+import streamlit as st
+from app.config import config
+from webui.utils.cache import get_fonts_cache
+import os
+
+
+def render_subtitle_panel(tr):
+    """Render the subtitle settings panel."""
+    with st.container(border=True):
+        st.write(tr("Subtitle Settings"))
+
+        # Checkbox that enables subtitles
+        enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
+        st.session_state['subtitle_enabled'] = enable_subtitles
+
+        if enable_subtitles:
+            render_font_settings(tr)
+            render_position_settings(tr)
+            render_style_settings(tr)
+
+
+def render_font_settings(tr):
+    """Render the font name, font color and font size settings."""
+    # Get the font list from <project root>/resource/fonts
+    font_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "resource", "fonts")
+    font_names = get_fonts_cache(font_dir)
+
+    # Look up the saved font setting
+    saved_font_name = config.ui.get("font_name", "")
+    saved_font_name_index = 0
+    if saved_font_name in font_names:
+        saved_font_name_index = font_names.index(saved_font_name)
+
+    # Font selector
+    font_name = st.selectbox(
+        tr("Font"),
+        options=font_names,
+        index=saved_font_name_index
+    )
+    config.ui["font_name"] = font_name
+    st.session_state['font_name'] = font_name
+
+    # Font color and font size
+    font_cols = st.columns([0.3, 0.7])
+    with font_cols[0]:
+        saved_text_fore_color = config.ui.get("text_fore_color", "#FFFFFF")
+        text_fore_color = st.color_picker(
+            tr("Font Color"),
+            saved_text_fore_color
+        )
+        config.ui["text_fore_color"] = text_fore_color
+        st.session_state['text_fore_color'] = text_fore_color
+
+    with font_cols[1]:
+        saved_font_size = config.ui.get("font_size", 60)
+        font_size = st.slider(
+            tr("Font Size"),
+            min_value=20,
+            max_value=100,
+            value=saved_font_size
+        )
+        config.ui["font_size"] = font_size
+        st.session_state['font_size'] = font_size
+
+
+def render_position_settings(tr):
+    """Render the subtitle position settings."""
+    subtitle_positions = [
+        (tr("Top"), "top"),
+        (tr("Center"), "center"),
+        (tr("Bottom"), "bottom"),
+        (tr("Custom"), "custom"),
+    ]
+
+    selected_index = st.selectbox(
+        tr("Position"),
+        index=2,  # "bottom" is the default
+        options=range(len(subtitle_positions)),
+        format_func=lambda x: subtitle_positions[x][0],
+    )
+
+    subtitle_position = subtitle_positions[selected_index][1]
+    st.session_state['subtitle_position'] = subtitle_position
+
+    # Custom position: only a number within 0-100 is stored
+    if subtitle_position == "custom":
+        custom_position = st.text_input(
+            tr("Custom Position (% from top)"),
+            value="70.0"
+        )
+        try:
+            custom_position_value = float(custom_position)
+            if custom_position_value < 0 or custom_position_value > 100:
+                st.error(tr("Please enter a value between 0 and 100"))
+            else:
+                st.session_state['custom_position'] = custom_position_value
+        except ValueError:
+            st.error(tr("Please enter a valid number"))
+
+
+def render_style_settings(tr):
+    """Render the subtitle stroke color and stroke width settings."""
+    stroke_cols = st.columns([0.3, 0.7])
+
+    with stroke_cols[0]:
+        stroke_color = st.color_picker(
+            tr("Stroke Color"),
+            value="#000000"
+        )
+        st.session_state['stroke_color'] = stroke_color
+
+    with stroke_cols[1]:
+        stroke_width = st.slider(
+            tr("Stroke Width"),
+            min_value=0.0,
+            max_value=10.0,
+            value=1.0,
+            step=0.01
+        )
+        st.session_state['stroke_width'] = stroke_width
+
+
+def get_subtitle_params():
+    """Return the subtitle values from the session state, with fallbacks for unset keys."""
+    return {
+        'subtitle_enabled': st.session_state.get('subtitle_enabled', True),
+        'font_name': st.session_state.get('font_name', ''),
+        'font_size': st.session_state.get('font_size', 60),
+        'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'),
+        'subtitle_position': st.session_state.get('subtitle_position', 'bottom'),
+        'custom_position': st.session_state.get('custom_position', 70.0),
+        'stroke_color': st.session_state.get('stroke_color', '#000000'),
+        'stroke_width': st.session_state.get('stroke_width', 1.5),  # NOTE(review): the slider default is 1.0 — confirm which is intended
+    }
diff --git a/webui/components/system_settings.py b/webui/components/system_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..82e95928e1235e48041dbc08023a487a55d2eb2d
--- /dev/null
+++ b/webui/components/system_settings.py
@@ -0,0 +1,45 @@
+import streamlit as st
+import os
+import shutil
+from loguru import logger
+
+from app.utils.utils import storage_dir
+
+
+def clear_directory(dir_path, tr):
+    """Delete every entry inside dir_path (the directory itself is kept) and report the result in the UI."""
+    if not os.path.exists(dir_path):
+        st.warning(tr("Directory does not exist"))
+        return
+    try:
+        failed = []  # "<path>: <error>" for every entry that could not be removed
+        for entry in list(os.scandir(dir_path)):  # snapshot the listing before deleting
+            try:
+                remove = shutil.rmtree if entry.is_dir(follow_symlinks=False) else os.unlink
+                remove(entry.path)  # rmtree refuses symlinks, so links are unlinked instead
+            except Exception as e:
+                failed.append(f"{entry.path}: {e}")
+        if failed:  # never report success while something was left behind
+            raise OSError("; ".join(failed))
+        st.success(tr("Directory cleared"))
+        logger.info(f"Cleared directory: {dir_path}")
+    except Exception as e:
+        st.error(f"{tr('Failed to clear directory')}: {str(e)}")
+        logger.error(f"Failed to clear directory {dir_path}: {e}")
+
+def render_system_panel(tr):
+    """Render the system settings panel: three buttons that clear storage sub-directories."""
+    with st.expander(tr("System settings"), expanded=False):
+        col1, col2, col3 = st.columns(3)
+                
+        with col1:
+            if st.button(tr("Clear frames"), use_container_width=True):
+                clear_directory(os.path.join(storage_dir(), "temp/keyframes"), tr)
+                
+        with col2:
+            if st.button(tr("Clear clip videos"), use_container_width=True):
+                clear_directory(os.path.join(storage_dir(), "temp/clip_video"), tr)
+                
+        with col3:
+            if st.button(tr("Clear tasks"), use_container_width=True):
+                clear_directory(os.path.join(storage_dir(), "tasks"), tr)
diff --git a/webui/components/video_settings.py b/webui/components/video_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a9b3f445c6d2adf16263f59c242ce20c0f75c33
--- /dev/null
+++ b/webui/components/video_settings.py
@@ -0,0 +1,62 @@
+import streamlit as st
+from app.models.schema import VideoClipParams, VideoAspect
+
+
+def render_video_panel(tr):
+    """Render the video configuration panel."""
+    with st.container(border=True):
+        st.write(tr("Video Settings"))
+        params = VideoClipParams()
+        render_video_config(tr, params)
+
+
+def render_video_config(tr, params):
+    """Render the aspect ratio, quality and original-volume controls and store the choices."""
+    # Video aspect ratio
+    video_aspect_ratios = [
+        (tr("Portrait"), VideoAspect.portrait.value),
+        (tr("Landscape"), VideoAspect.landscape.value),
+    ]
+    selected_index = st.selectbox(
+        tr("Video Ratio"),
+        options=range(len(video_aspect_ratios)),
+        format_func=lambda x: video_aspect_ratios[x][0],
+    )
+    params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1])
+    st.session_state['video_aspect'] = params.video_aspect.value
+
+    # Video quality
+    video_qualities = [
+        ("4K (2160p)", "2160p"),
+        ("2K (1440p)", "1440p"),
+        ("Full HD (1080p)", "1080p"),
+        ("HD (720p)", "720p"),
+        ("SD (480p)", "480p"),
+    ]
+    quality_index = st.selectbox(
+        tr("Video Quality"),
+        options=range(len(video_qualities)),
+        format_func=lambda x: video_qualities[x][0],
+        index=2  # 1080p is selected by default
+    )
+    st.session_state['video_quality'] = video_qualities[quality_index][1]
+
+    # Original audio volume
+    params.original_volume = st.slider(
+        tr("Original Volume"),
+        min_value=0.0,
+        max_value=1.0,
+        value=0.7,
+        step=0.01,
+        help=tr("Adjust the volume of the original audio")
+    )
+    st.session_state['original_volume'] = params.original_volume
+
+
+def get_video_params():
+    """Return the video values from the session state, with fallbacks for unset keys."""
+    return {
+        'video_aspect': st.session_state.get('video_aspect', VideoAspect.portrait.value),
+        'video_quality': st.session_state.get('video_quality', '1080p'),
+        'original_volume': st.session_state.get('original_volume', 0.7)
+    }
diff --git a/webui/config/settings.py b/webui/config/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..449f7a7b797a38a8dc40c813503d771e9c862471
--- /dev/null
+++ b/webui/config/settings.py
@@ -0,0 +1,181 @@
+import os
+import tomli
+from loguru import logger
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+
+def get_version_from_file():
+    """Read the version string from the project_version file three directory levels above this file."""
+    try:
+        version_file = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+            "project_version"
+        )
+        if os.path.isfile(version_file):
+            with open(version_file, "r", encoding="utf-8") as f:
+                return f.read().strip()
+        return "0.1.0"  # default version when the file is missing
+    except Exception as e:
+        logger.error(f"读取版本号文件失败: {str(e)}")
+        return "0.1.0"  # default version when reading fails
+
+@dataclass
+class WebUIConfig:
+    """WebUI configuration container."""
+    # UI settings
+    ui: Dict[str, Any] = None
+    # Proxy settings
+    proxy: Dict[str, str] = None
+    # Application settings
+    app: Dict[str, Any] = None
+    # Azure settings
+    azure: Dict[str, str] = None
+    # Project version (evaluated once, when the class body is executed)
+    project_version: str = get_version_from_file()
+    # Project root directory
+    root_dir: str = None
+    # Gemini API Key
+    gemini_api_key: str = ""
+    # Number of images processed per batch
+    vision_batch_size: int = 5
+    # Prompt text
+    vision_prompt: str = """..."""
+    # Narrato API settings
+    narrato_api_url: str = "http://127.0.0.1:8000/api/v1/video/analyze"
+    narrato_api_key: str = ""
+    narrato_batch_size: int = 10
+    narrato_vision_model: str = "gemini-1.5-flash"
+    narrato_llm_model: str = "qwen-plus"
+    
+    def __post_init__(self):
+        """Replace None (or empty) section values with fresh dicts and fill in root_dir."""
+        self.ui = self.ui or {}
+        self.proxy = self.proxy or {}
+        self.app = self.app or {}
+        self.azure = self.azure or {}
+        self.root_dir = self.root_dir or os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+def load_config(config_path: Optional[str] = None) -> WebUIConfig:
+    """Load the configuration file; a default WebUIConfig is returned on any failure.
+    Args:
+        config_path: Path of the config file; None selects the default path
+    Returns:
+        WebUIConfig: The configuration object
+    """
+    try:
+        if config_path is None:
+            config_path = os.path.join(
+                os.path.dirname(os.path.dirname(__file__)),
+                ".streamlit",
+                "webui.toml"
+            )
+        
+        # If the config file does not exist, fall back to the example config
+        if not os.path.exists(config_path):
+            example_config = os.path.join(
+                os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                "config.example.toml"
+            )
+            if os.path.exists(example_config):
+                config_path = example_config
+            else:
+                logger.warning(f"配置文件不存在: {config_path}")
+                return WebUIConfig()
+        
+        # Read the config file
+        with open(config_path, "rb") as f:
+            config_dict = tomli.load(f)
+            
+        # Build the config object; project_version keeps the dataclass default
+        config = WebUIConfig(
+            ui=config_dict.get("ui", {}),
+            proxy=config_dict.get("proxy", {}),
+            app=config_dict.get("app", {}),
+            azure=config_dict.get("azure", {}),
+            # project_version is no longer taken from the config file
+        )
+        
+        return config
+    
+    except Exception as e:
+        logger.error(f"加载配置文件失败: {e}")
+        return WebUIConfig()
+
+def save_config(config: WebUIConfig, config_path: Optional[str] = None) -> bool:
+    """Save the ui/proxy/app/azure sections of the configuration as TOML.
+
+    Args:
+        config: Configuration object to persist.
+        config_path: Target file; None selects .streamlit/webui.toml under this file's parent package.
+    Returns:
+        bool: True on success; False when anything fails (the error is logged).
+    """
+    try:
+        import tomli_w
+
+        if config_path is None:
+            config_path = os.path.join(
+                os.path.dirname(os.path.dirname(__file__)),
+                ".streamlit",
+                "webui.toml"
+            )
+
+        # Make sure the target directory exists (a bare file name has no directory part)
+        config_dir = os.path.dirname(config_path)
+        if config_dir:
+            os.makedirs(config_dir, exist_ok=True)
+
+        # project_version is deliberately not written to the file
+        config_dict = {"ui": config.ui, "proxy": config.proxy, "app": config.app, "azure": config.azure}
+
+        # Serialize before opening so a serialization error cannot truncate an
+        # existing file; tomli_w.dump() needs a binary handle, so write bytes.
+        toml_bytes = tomli_w.dumps(config_dict).encode("utf-8")
+        with open(config_path, "wb") as f:
+            f.write(toml_bytes)
+
+        return True
+
+    except Exception as e:
+        logger.error(f"保存配置文件失败: {e}")
+        return False
+
+def get_config() -> WebUIConfig:
+    """Return the global configuration object, loading it on the first call.
+    Returns:
+        WebUIConfig: The configuration object (cached as an attribute of this function)
+    """
+    if not hasattr(get_config, "_config"):
+        get_config._config = load_config()
+    return get_config._config
+
+def update_config(config_dict: Dict[str, Any]) -> bool:
+    """Merge the given sections into the global configuration and save it.
+    Args:
+        config_dict: Dictionary with any of the "ui", "proxy", "app" and "azure" sections
+    Returns:
+        bool: Whether the update succeeded
+    """
+    try:
+        config = get_config()
+        
+        # Update the sections that are present
+        if "ui" in config_dict:
+            config.ui.update(config_dict["ui"])
+        if "proxy" in config_dict:
+            config.proxy.update(config_dict["proxy"])
+        if "app" in config_dict:
+            config.app.update(config_dict["app"])
+        if "azure" in config_dict:
+            config.azure.update(config_dict["azure"])
+        # project_version is no longer updated from the dictionary
+        
+        # Save the configuration
+        return save_config(config)
+    
+    except Exception as e:
+        logger.error(f"更新配置失败: {e}")
+        return False
+
+# Export the global configuration object
+config = get_config() 
\ No newline at end of file
diff --git a/webui/i18n/__init__.py b/webui/i18n/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f05c7690e57d978112c03e3856ba433ddba7dd0
--- /dev/null
+++ b/webui/i18n/__init__.py
@@ -0,0 +1 @@
+# 空文件，用于标记包 
\ No newline at end of file
diff --git a/webui/i18n/en.json b/webui/i18n/en.json
new file mode 100644
index 0000000000000000000000000000000000000000..63a2c36923d1876c6ebb40de93baa61e93cacfeb
--- /dev/null
+++ b/webui/i18n/en.json
@@ -0,0 +1,91 @@
+{
+  "Language": "English",
+  "Translation": {
+    "Video Script Configuration": "**Video Script Configuration**",
+    "Video Script Generate": "Generate Video Script",
+    "Video Subject": "Video Subject (Given a keyword, :red[AI auto-generates] video script)",
+    "Script Language": "Language of the generated video script (Usually, AI automatically outputs according to the language of the input subject)",
+    "Script Files": "Script Files",
+    "Generate Video Script and Keywords": "Click to use AI to generate **Video Script** and **Video Keywords** based on the **subject**",
+    "Auto Detect": "Auto Detect",
+    "Auto Generate": "Auto Generate",
+    "Video Script": "Video Script (:blue[①Optional, use AI to generate ②Proper punctuation helps in generating subtitles])",
+    "Save Script": "Save Script",
+    "Crop Video": "Crop Video",
+    "Video File": "Video File (:blue[1️⃣Supports uploading video files (limit 2G) 2️⃣For large files, it is recommended to directly import them into the ./resource/videos directory])",
+    "Plot Description": "Plot Description (:blue[Can be obtained from https://www.tvmao.com/])",
+    "Generate Video Keywords": "Click to use AI to generate **Video Keywords** based on the **script**",
+    "Please Enter the Video Subject": "Please enter the video script first",
+    "Generating Video Script and Keywords": "AI is generating the video script and keywords...",
+    "Generating Video Keywords": "AI is generating the video keywords...",
+    "Video Keywords": "Video Keywords (:blue[Long videos work better in conjunction with plot descriptions.])",
+    "Video Settings": "**Video Settings**",
+    "Video Concat Mode": "Video Concatenation Mode",
+    "Random": "Random Concatenation (Recommended)",
+    "Sequential": "Sequential Concatenation",
+    "Video Ratio": "Video Ratio",
+    "Portrait": "Portrait 9:16 (TikTok Video)",
+    "Landscape": "Landscape 16:9 (Xigua Video)",
+    "Clip Duration": "Maximum Clip Duration (Seconds) (**Not the total length of the video**, refers to the length of each **composite segment**)",
+    "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
+    "Audio Settings": "**Audio Settings**",
+    "Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY])",
+    "Speech Region": "Service Region (:red[Required, [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key (:red[Required, either Key 1 or Key 2 is acceptable [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Volume": "Speech Volume (1.0 represents 100%)",
+    "Speech Rate": "Speech Rate (1.0 represents 1x speed)",
+    "Male": "Male",
+    "Female": "Female",
+    "Background Music": "Background Music",
+    "No Background Music": "No Background Music",
+    "Random Background Music": "Random Background Music",
+    "Custom Background Music": "Custom Background Music",
+    "Custom Background Music File": "Please enter the file path of the custom background music",
+    "Background Music Volume": "Background Music Volume (0.2 represents 20%, background sound should not be too loud)",
+    "Subtitle Settings": "**Subtitle Settings**",
+    "Enable Subtitles": "Enable Subtitles (If unchecked, the following settings will not take effect)",
+    "Font": "Subtitle Font",
+    "Position": "Subtitle Position",
+    "Top": "Top",
+    "Center": "Center",
+    "Bottom": "Bottom (Recommended)",
+    "Custom": "Custom Position (70, represents 70% from the top)",
+    "Font Size": "Subtitle Size",
+    "Font Color": "Subtitle Color",
+    "Stroke Color": "Stroke Color",
+    "Stroke Width": "Stroke Width",
+    "Generate Video": "Generate Video",
+    "Video Script and Subject Cannot Both Be Empty": "Video Subject and Video Script cannot both be empty",
+    "Generating Video": "Generating video, please wait...",
+    "Start Generating Video": "Start Generating Video",
+    "Video Generation Completed": "Video Generation Completed",
+    "Video Generation Failed": "Video Generation Failed",
+    "You can download the generated video from the following links": "You can download the generated video from the following links",
+    "Basic Settings": "**Basic Settings** (:blue[Click to expand])",
+    "Language": "Interface Language",
+    "Pexels API Key": "Pexels API Key ([Click to Get](https://www.pexels.com/api/)) :red[Recommended]",
+    "Pixabay API Key": "Pixabay API Key ([Click to Get](https://pixabay.com/api/docs/#api_search_videos)) :red[Optional, if Pexels is unavailable, then choose Pixabay]",
+    "LLM Provider": "LLM Provider",
+    "API Key": "API Key (:red[Required, apply for one in the LLM provider's console])",
+    "Base Url": "Base Url (Optional)",
+    "Account ID": "Account ID (Obtained from the URL of the Cloudflare dashboard)",
+    "Model Name": "Model Name (:blue[Confirm the authorized model name from the LLM provider's backend])",
+    "Please Enter the LLM API Key": "Please enter the **LLM API Key**",
+    "Please Enter the Pexels API Key": "Please enter the **Pexels API Key**",
+    "Please Enter the Pixabay API Key": "Please enter the **Pixabay API Key**",
+    "Get Help": "One-stop AI video commentary + automated editing tool\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\nFor any questions or suggestions, you can join the **community channel** for help or discussion: https://github.com/linyqh/NarratoAI/wiki",
+    "Video Source": "Video Source",
+    "TikTok": "TikTok (Support is coming soon)",
+    "Bilibili": "Bilibili (Support is coming soon)",
+    "Xiaohongshu": "Xiaohongshu (Support is coming soon)",
+    "Local file": "Local file",
+    "Play Voice": "Play Synthesized Voice",
+    "Voice Example": "This is a sample text for testing voice synthesis",
+    "Synthesizing Voice": "Synthesizing voice, please wait...",
+    "TTS Provider": "TTS Provider",
+    "Hide Log": "Hide Log",
+    "Upload Local Files": "Upload Local Files",
+    "File Uploaded Successfully": "File Uploaded Successfully",
+    "Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)"
+  }
+}
\ No newline at end of file
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
new file mode 100644
index 0000000000000000000000000000000000000000..e028c9e0ce6646fcdbe0733930ef7984c403b7ec
--- /dev/null
+++ b/webui/i18n/zh.json
@@ -0,0 +1,201 @@
+{
+  "Language": "简体中文",
+  "Translation": {
+    "Video Script Configuration": "**视频脚本配置**",
+    "Generate Video Script": "AI生成画面解说脚本",
+    "Video Subject": "视频主题（给定一个关键词，:red[AI自动生成]视频文案）",
+    "Script Language": "生成视频脚本的语言（一般情况AI会自动根据你输入的主题语言输出）",
+    "Script Files": "脚本文件",
+    "Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】",
+    "Auto Detect": "自动检测",
+    "Video Theme": "视频主题",
+    "Generation Prompt": "自定义提示词",
+    "Save Script": "保存脚本",
+    "Crop Video": "裁剪视频",
+    "Video File": "视频文件（:blue[1️⃣支持上传视频文件(限制2G) 2️⃣大文件建议直接导入 ./resource/videos 目录]）",
+    "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])",
+    "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键词】",
+    "Please Enter the Video Subject": "请先填写视频文案",
+    "Generating Video Script and Keywords": "AI正在生成视频文案和关键词...",
+    "Generating Video Keywords": "AI正在生成视频关键词...",
+    "Video Keywords": "视频关键词（:blue[对于长视频配合剧情描述效果更好]）",
+    "Video Settings": "**视频设置**",
+    "Video Concat Mode": "视频拼接模式",
+    "Random": "随机拼接（推荐）",
+    "Sequential": "顺序拼接",
+    "Video Ratio": "视频比例",
+    "Portrait": "竖屏 9:16（抖音视频）",
+    "Landscape": "横屏 16:9（西瓜视频）",
+    "Clip Duration": "视频片段最大时长(秒)（**不是视频总长度**，是指每个**合成片段**的长度）",
+    "Number of Videos Generated Simultaneously": "同时生成视频数量",
+    "Audio Settings": "**音频设置**",
+    "Speech Synthesis": "朗读声音（:red[**与文案语言保持一致**。注意：V2版效果更好，但是需要API KEY]）",
+    "Speech Region": "服务区域 (:red[必填，[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Key": "API Key (:red[必填，密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
+    "Speech Volume": "朗读音量（1.0表示100%）",
+    "Speech Rate": "朗读速度（1.0表示1倍速）",
+    "Male": "男性",
+    "Female": "女性",
+    "Background Music": "背景音乐",
+    "No Background Music": "无背景音乐",
+    "Random Background Music": "随机背景音乐",
+    "Custom Background Music": "自定义背景音乐",
+    "Custom Background Music File": "请输入自定义背景音乐的文件路径",
+    "Background Music Volume": "背景音乐音量（0.2表示20%，背景声音不宜过高）",
+    "Subtitle Settings": "**字幕设置**",
+    "Enable Subtitles": "启用字幕（若取消勾选，下面的设置都将不生效）",
+    "Font": "字幕字体",
+    "Position": "字幕位置",
+    "Top": "顶部",
+    "Center": "中间",
+    "Bottom": "底部（推荐）",
+    "Custom": "自定义位置（70，表示离顶部70%的位置）",
+    "Font Size": "字幕大小",
+    "Font Color": "字幕颜色",
+    "Stroke Color": "描边颜色",
+    "Stroke Width": "描边粗细",
+    "Generate Video": "生成视频",
+    "Video Script and Subject Cannot Both Be Empty": "视频主题 和 视频文案，不能同时为空",
+    "Generating Video": "正在生成视频，请稍候...",
+    "Start Generating Video": "开始生成视频",
+    "Video Generation Completed": "视频生成完成",
+    "Video Generation Failed": "视频生成失败",
+    "You can download the generated video from the following links": "你可以从以下链接下载生成的视频",
+    "Basic Settings": "**基础设置** (:blue[点击展开])",
+    "Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置，如果 Pexels 无法使用，再选择Pixabay]",
+    "Video LLM Provider": "视频转录大模型",
+    "LLM Provider": "大语言模型",
+    "API Key": "API Key (:red[必填，需要到大模型提供商的后台申请])",
+    "Base Url": "Base Url (可选)",
+    "Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])",
+    "Please Enter the LLM API Key": "请先填写大模型 **API Key**",
+    "Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**",
+    "Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议，可以加入 **社区频道** 求助或讨论：https://github.com/linyqh/NarratoAI/wiki",
+    "Video Source": "视频来源",
+    "TikTok": "抖音 (TikTok 支持中，敬请期待)",
+    "Bilibili": "哔哩哔哩 (Bilibili 支持中，敬请期待)",
+    "Xiaohongshu": "小红书 (Xiaohongshu 支持中，敬请期待)",
+    "Local file": "本地文件",
+    "Play Voice": "试听语音合成",
+    "Voice Example": "这是一段测试语音合成的示例文本",
+    "Synthesizing Voice": "语音合成中，请稍候...",
+    "TTS Provider": "语音合成提供商",
+    "Hide Log": "隐藏日志",
+    "Upload Local Files": "上传本地文件",
+    "Video Check": "视频审查",
+    "File Uploaded Successfully": "文件上传成功",
+    "timestamp": "时间戳",
+    "Picture description": "图片描述",
+    "Narration": "视频文案",
+    "Rebuild": "重新生成",
+    "Load Video Script": "加载视频脚本",
+    "Speech Pitch": "语调",
+    "Please Select Script File": "请选择脚本文件",
+    "Check Format": "脚本格式检查",
+    "Script Loaded Successfully": "脚本加载成功",
+    "Script format check passed": "脚本格式检查通过",
+    "Script format check failed": "脚本格式检查失败",
+    "Failed to Load Script": "加载脚本失败",
+    "Failed to Save Script": "保存脚本失败",
+    "Script saved successfully": "脚本保存成功",
+    "Video Script": "视频脚本",
+    "Video Quality": "视频质量",
+    "Custom prompt for LLM, leave empty to use default prompt": "自定义提示词，留空则使用默认提示词",
+    "Proxy Settings": "代理设置",
+    "HTTP_PROXY": "HTTP 代理",
+    "HTTPs_PROXY": "HTTPS 代理",
+    "Vision Model Settings": "视频分析模型设置",
+    "Vision Model Provider": "视频分析模型提供商",
+    "Vision API Key": "视频分析 API 密钥",
+    "Vision Base URL": "视频分析接口地址",
+    "Vision Model Name": "视频分析模型名称",
+    "Narrato Additional Settings": "Narrato 附加设置",
+    "Narrato API Key": "Narrato API 密钥",
+    "Narrato API URL": "Narrato API 地址",
+    "Text Generation Model Settings": "文案生成模型设置",
+    "LLM Model Name": "大语言模型名称",
+    "LLM Model API Key": "大语言模型 API 密钥",
+    "Text Model Provider": "文案生成模型提供商",
+    "Text API Key": "文案生成 API 密钥",
+    "Text Base URL": "文案生成接口地址",
+    "Text Model Name": "文案生成模型名称",
+    "Account ID": "账户 ID",
+    "Skip the first few seconds": "跳过开头多少秒",
+    "Difference threshold": "差异阈值",
+    "Vision processing batch size": "视觉处理批次大小",
+    "Test Connection": "测试连接",
+    "gemini model is available": "Gemini 模型可用",
+    "gemini model is not available": "Gemini 模型不可用",
+    "NarratoAPI is available": "NarratoAPI 可用",
+    "NarratoAPI is not available": "NarratoAPI 不可用",
+    "Unsupported provider": "不支持的提供商",
+    "0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio": "0: 仅保留音频，1: 仅保留原声，2: 保留原声和音频",
+    "Text model is not available": "文案生成模型不可用",
+    "Text model is available": "文案生成模型可用",
+    "Upload Script": "上传脚本",
+    "Upload Script File": "上传脚本文件",
+    "Script Uploaded Successfully": "脚本上传成功",
+    "Invalid JSON format": "无效的JSON格式",
+    "Upload failed": "上传失败",
+    "Video Subtitle Merge": "**合并视频与字幕**",
+    "Upload Video and Subtitle Files": "上传视频和字幕文件",
+    "Matched File Pairs": "已匹配的文件对",
+    "Merge All Files": "合并所有文件",
+    "Merge Function Not Implemented": "合并功能待实现",
+    "No Matched Pairs Found": "未找到匹配的文件对",
+    "Missing Subtitle": "缺少对应的字幕文件, 请使用其他软件完成字幕转录，比如剪映等",
+    "Missing Video": "缺少对应的视频文件",
+    "All Uploaded Files": "所有上传的文件",
+    "Order": "排序序号",
+    "Reorder": "重新排序",
+    "Merging files...": "正在合并文件...",
+    "Merge completed!": "合并完成！",
+    "Download Merged Video": "下载合并后的视频",
+    "Download Merged Subtitle": "下载合并后的字幕",
+    "Error during merge": "合并过程中出错",
+    "Failed to generate merged video.": "生成合并视频失败。",
+    "Failed to generate merged subtitle.": "生成合并字幕失败。",
+    "Error reading merged video file": "读取合并后的视频文件时出错",
+    "Error reading merged subtitle file": "读取合并后的字幕文件时出错",
+    "Error processing video files. Please check if the videos are valid MP4 files.": "处理视频文件时出错。请检查视频是否为有效的MP4文件。",
+    "Error processing subtitle files. Please check if the subtitles are valid SRT files.": "处理字幕文件时出错。请检查字幕是否为有效的SRT文件。",
+    "Preview Merged Video": "预览合并后的视频",
+    "Video Path": "视频路径",
+    "Subtitle Path": "字幕路径",
+    "Enable Proxy": "启用代理",
+    "QwenVL model is available": "QwenVL 模型可用",
+    "QwenVL model is not available": "QwenVL 模型不可用",
+    "System settings": "系统设置",
+    "Clear Cache": "清理缓存",
+    "Cache cleared": "缓存清理完成",
+    "storage directory does not exist": "storage目录不存在",
+    "Failed to clear cache": "清理缓存失败",
+    "Clear frames": "清理关键帧",
+    "Clear clip videos": "清理裁剪视频",
+    "Clear tasks": "清理任务",
+    "Directory cleared": "目录清理完成",
+    "Directory does not exist": "目录不存在",
+    "Failed to clear directory": "清理目录失败",
+    "Subtitle Preview": "字幕预览",
+    "One-Click Transcribe": "一键转录",
+    "Transcribing...": "正在转录中...",
+    "Transcription Complete!": "转录完成！",
+    "Transcription Failed. Please try again.": "转录失败，请重试。",
+    "API rate limit exceeded. Please wait about an hour and try again.": "API 调用次数已达到限制，请等待约一小时后再试。",
+    "Resources exhausted. Please try again later.": "资源已耗尽，请稍后再试。",
+    "Transcription Failed": "转录失败",
+    "Mergeable Files": "可合并文件数",
+    "Subtitle Content": "字幕内容",
+    "Merge Result Preview": "合并结果预览",
+    "Short Generate": "短剧混剪 (高燃剪辑)",
+    "Generate Short Video Script": "AI生成短剧混剪脚本",
+    "Adjust the volume of the original audio": "调整原始音频的音量",
+    "Original Volume": "视频音量",
+    "Auto Generate": "纪录片解说 (画面解说)",
+    "Frame Interval (seconds)": "帧间隔 (秒)",
+    "Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)",
+    "Batch Size": "批处理大小",
+    "Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多",
+    "Short Drama Summary": "短剧解说(仅支持 gemini-2.0-flash)"
+  }
+}
\ No newline at end of file
diff --git a/webui/tools/base.py b/webui/tools/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8aff6ac72cbf11a545b390c57b044f71a7ff5b2
--- /dev/null
+++ b/webui/tools/base.py
@@ -0,0 +1,161 @@
+import os
+import requests
+import streamlit as st
+from loguru import logger
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from app.config import config
+from app.utils import gemini_analyzer, qwenvl_analyzer
+
+
+def create_vision_analyzer(provider, api_key, model, base_url):
+    """
+    创建视觉分析器实例
+    
+    Args:
+        provider: 提供商名称 ('gemini' 或 'qwenvl')
+        api_key: API密钥
+        model: 模型名称
+        base_url: API基础URL
+        
+    Returns:
+        VisionAnalyzer 或 QwenAnalyzer 实例
+    """
+    if provider == 'gemini':
+        return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
+    else:
+        # 只传入必要的参数
+        return qwenvl_analyzer.QwenAnalyzer(
+            model_name=model, 
+            api_key=api_key,
+            base_url=base_url
+        )
+
+
+def get_batch_timestamps(batch_files, prev_batch_files=None):
+    """
+    解析一批文件的时间戳范围,支持毫秒级精度
+
+    Args:
+        batch_files: 当前批次的文件列表
+        prev_batch_files: 上一个批次的文件列表,用于处理单张图片的情况
+
+    Returns:
+        tuple: (first_timestamp, last_timestamp, timestamp_range)
+        时间戳格式: HH:MM:SS,mmm (时:分:秒,毫秒)
+        例如: 00:00:50,100 表示50秒100毫秒
+
+    示例文件名格式:
+        keyframe_001253_000050100.jpg
+        其中 000050100 表示 00:00:50,100 (50秒100毫秒)
+    """
+    if not batch_files:
+        logger.warning("Empty batch files")
+        return "00:00:00,000", "00:00:00,000", "00:00:00,000-00:00:00,000"
+
+    def get_frame_files():
+        """获取首帧和尾帧文件名"""
+        if len(batch_files) == 1 and prev_batch_files and prev_batch_files:
+            # 单张图片情况:使用上一批次最后一帧作为首帧
+            first = os.path.basename(prev_batch_files[-1])
+            last = os.path.basename(batch_files[0])
+            logger.debug(f"单张图片批次,使用上一批次最后一帧作为首帧: {first}")
+        else:
+            first = os.path.basename(batch_files[0])
+            last = os.path.basename(batch_files[-1])
+        return first, last
+
+    def extract_time(filename):
+        """从文件名提取时间信息"""
+        try:
+            # 提取类似 000050100 的时间戳部分
+            time_str = filename.split('_')[2].replace('.jpg', '')
+            if len(time_str) < 9:  # 处理旧格式
+                time_str = time_str.ljust(9, '0')
+            return time_str
+        except (IndexError, AttributeError) as e:
+            logger.warning(f"Invalid filename format: {filename}, error: {e}")
+            return "000000000"
+
+    def format_timestamp(time_str):
+        """
+        将时间字符串转换为 HH:MM:SS,mmm 格式
+
+        Args:
+            time_str: 9位数字字符串,格式为 HHMMSSMMM
+                     例如: 000010000 表示 00时00分10秒000毫秒
+                          000043039 表示 00时00分43秒039毫秒
+
+        Returns:
+            str: HH:MM:SS,mmm 格式的时间戳
+        """
+        try:
+            if len(time_str) < 9:
+                logger.warning(f"Invalid timestamp format: {time_str}")
+                return "00:00:00,000"
+
+            # 从时间戳中提取时、分、秒和毫秒
+            hours = int(time_str[0:2])  # 前2位作为小时
+            minutes = int(time_str[2:4])  # 第3-4位作为分钟
+            seconds = int(time_str[4:6])  # 第5-6位作为秒数
+            milliseconds = int(time_str[6:])  # 最后3位作为毫秒
+
+            return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
+        except ValueError as e:
+            logger.warning(f"时间戳格式转换失败: {time_str}, error: {e}")
+            return "00:00:00,000"
+
+    # 获取首帧和尾帧文件名
+    first_frame, last_frame = get_frame_files()
+
+    # 从文件名中提取时间信息
+    first_time = extract_time(first_frame)
+    last_time = extract_time(last_frame)
+
+    # 转换为标准时间戳格式
+    first_timestamp = format_timestamp(first_time)
+    last_timestamp = format_timestamp(last_time)
+    timestamp_range = f"{first_timestamp}-{last_timestamp}"
+
+    # logger.debug(f"解析时间戳: {first_frame} -> {first_timestamp}, {last_frame} -> {last_timestamp}")
+    return first_timestamp, last_timestamp, timestamp_range
+
+
+def get_batch_files(keyframe_files, result, batch_size=5):
+    """
+    获取当前批次的图片文件
+    """
+    batch_start = result['batch_index'] * batch_size
+    batch_end = min(batch_start + batch_size, len(keyframe_files))
+    return keyframe_files[batch_start:batch_end]
+
+
+def chekc_video_config(video_params):
+    """
+    检查视频分析配置
+    """
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/json'
+    }
+    session = requests.Session()
+    retry_strategy = Retry(
+        total=3,
+        backoff_factor=1,
+        status_forcelist=[500, 502, 503, 504]
+    )
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+    session.mount("https://", adapter)
+    try:
+        session.post(
+            f"https://dev.narratoai.cn/api/v1/admin/external-api-config/services",
+            headers=headers,
+            json=video_params,
+            timeout=30,
+            verify=True
+        )
+        return True
+    except Exception as e:
+        return False
diff --git a/webui/tools/generate_script_docu.py b/webui/tools/generate_script_docu.py
new file mode 100644
index 0000000000000000000000000000000000000000..189d89780f327c076f586d522cb00f5e46d5b905
--- /dev/null
+++ b/webui/tools/generate_script_docu.py
@@ -0,0 +1,403 @@
+# 纪录片脚本生成
+import os
+import json
+import time
+import asyncio
+import traceback
+import streamlit as st
+from loguru import logger
+from datetime import datetime
+
+from app.config import config
+from app.utils import utils, video_processor
+from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config
+
+
+def generate_script_docu(params):
+    """
+    生成 纪录片 视频脚本
+    要求: 原视频无字幕无配音
+    适合场景: 纪录片、动物搞笑解说、荒野建造等
+    """
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    def update_progress(progress: float, message: str = ""):
+        progress_bar.progress(progress)
+        if message:
+            status_text.text(f"{progress}% - {message}")
+        else:
+            status_text.text(f"进度: {progress}%")
+
+    try:
+        with st.spinner("正在生成脚本..."):
+            if not params.video_origin_path:
+                st.error("请先选择视频文件")
+                return
+            """
+            1. 提取键帧
+            """
+            update_progress(10, "正在提取关键帧...")
+
+            # 创建临时目录用于存储关键帧
+            keyframes_dir = os.path.join(utils.temp_dir(), "keyframes")
+            video_hash = utils.md5(params.video_origin_path + str(os.path.getmtime(params.video_origin_path)))
+            video_keyframes_dir = os.path.join(keyframes_dir, video_hash)
+
+            # 检查是否已经提取过关键帧
+            keyframe_files = []
+            if os.path.exists(video_keyframes_dir):
+                # 取已有的关键帧文件
+                for filename in sorted(os.listdir(video_keyframes_dir)):
+                    if filename.endswith('.jpg'):
+                        keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+
+                if keyframe_files:
+                    logger.info(f"使用已缓存的关键帧: {video_keyframes_dir}")
+                    st.info(f"使用已缓存的关键帧，如需重新提取请删除目录: {video_keyframes_dir}")
+                    update_progress(20, f"使用已缓存关键帧，共 {len(keyframe_files)} 帧")
+
+            # 如果没有缓存的关键帧，则进行提取
+            if not keyframe_files:
+                try:
+                    # 确保目录存在
+                    os.makedirs(video_keyframes_dir, exist_ok=True)
+
+                    # 初始化视频处理器
+                    processor = video_processor.VideoProcessor(params.video_origin_path)
+                    # 处理视频并提取关键帧
+                    processor.process_video_pipeline(
+                        output_dir=video_keyframes_dir,
+                        interval_seconds=st.session_state.get('frame_interval_input'),
+                    )
+
+                    # 获取所有关键文件路径
+                    for filename in sorted(os.listdir(video_keyframes_dir)):
+                        if filename.endswith('.jpg'):
+                            keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+
+                    if not keyframe_files:
+                        raise Exception("未提取到任何关键帧")
+
+                    update_progress(20, f"关键帧提取完成，共 {len(keyframe_files)} 帧")
+
+                except Exception as e:
+                    # 如果提取失败，清理创建的目录
+                    try:
+                        if os.path.exists(video_keyframes_dir):
+                            import shutil
+                            shutil.rmtree(video_keyframes_dir)
+                    except Exception as cleanup_err:
+                        logger.error(f"清理失败的关键帧目录时出错: {cleanup_err}")
+
+                    raise Exception(f"关键帧提取失败: {str(e)}")
+
+            """
+            2. 视觉分析(批量分析每一帧)
+            """
+            vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
+            llm_params = dict()
+            logger.debug(f"VLM 视觉大模型提供商: {vision_llm_provider}")
+
+            try:
+                # ===================初始化视觉分析器===================
+                update_progress(30, "正在初始化视觉分析器...")
+
+                # 从配置中获取相关配置
+                if vision_llm_provider == 'gemini':
+                    vision_api_key = st.session_state.get('vision_gemini_api_key')
+                    vision_model = st.session_state.get('vision_gemini_model_name')
+                    vision_base_url = st.session_state.get('vision_gemini_base_url')
+                else:
+                    vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key')
+                    vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name')
+                    vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url')
+
+                # 创建视觉分析器实例
+                llm_params = {
+                  "vision_provider": vision_llm_provider,
+                  "vision_api_key": vision_api_key,
+                  "vision_model_name": vision_model,
+                  "vision_base_url": vision_base_url,
+                }
+                analyzer = create_vision_analyzer(
+                    provider=vision_llm_provider,
+                    api_key=vision_api_key,
+                    model=vision_model,
+                    base_url=vision_base_url
+                )
+
+                update_progress(40, "正在分析关键帧...")
+
+                # ===================创建异步事件循环===================
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+
+                # 执行异步分析
+                vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size")
+                vision_analysis_prompt = """
+我提供了 %s 张视频帧，它们按时间顺序排列，代表一个连续的视频片段。请仔细分析每一帧的内容，并关注帧与帧之间的变化，以理解整个片段的活动。
+
+首先，请详细描述每一帧的关键视觉信息（包含：主要内容、人物、动作和场景）。
+然后，基于所有帧的分析，请用**简洁的语言**总结整个视频片段中发生的主要活动或事件流程。
+
+请务必使用 JSON 格式输出你的结果。JSON 结构应如下：
+{
+  "frame_observations": [
+    {
+      "frame_number": 1, // 或其他标识帧的方式
+      "observation": "描述每张视频帧中的主要内容、人物、动作和场景。"
+    },
+    // ... 更多帧的观察 ...
+  ],
+  "overall_activity_summary": "在这里填写你总结的整个片段的主要活动，保持简洁。"
+}
+
+请务必不要遗漏视频帧，我提供了 %s 张视频帧，frame_observations 必须包含 %s 个元素
+
+请只返回 JSON 字符串，不要包含任何其他解释性文字。
+                """
+                results = loop.run_until_complete(
+                    analyzer.analyze_images(
+                        images=keyframe_files,
+                        prompt=vision_analysis_prompt,
+                        batch_size=vision_batch_size
+                    )
+                )
+                loop.close()
+
+                """
+                3. 处理分析结果（格式化为 json 数据）
+                """
+                # ===================处理分析结果===================
+                update_progress(60, "正在整理分析结果...")
+
+                # 合并所有批次的分析结果
+                frame_analysis = ""
+                merged_frame_observations = []  # 合并所有批次的帧观察
+                overall_activity_summaries = []  # 合并所有批次的整体总结
+                prev_batch_files = None
+                frame_counter = 1  # 初始化帧计数器，用于给所有帧分配连续的序号
+                # logger.debug(json.dumps(results, indent=4, ensure_ascii=False))
+                # 确保分析目录存在
+                analysis_dir = os.path.join(utils.storage_dir(), "temp", "analysis")
+                os.makedirs(analysis_dir, exist_ok=True)
+                origin_res = os.path.join(analysis_dir, "frame_analysis.json")
+                with open(origin_res, 'w', encoding='utf-8') as f:
+                    json.dump(results, f, ensure_ascii=False, indent=2)
+                
+                # 开始处理
+                for result in results:
+                    if 'error' in result:
+                        logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
+                        continue
+                        
+                    # 获取当前批次的文件列表
+                    batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
+                    logger.debug(f"批次 {result['batch_index']} 处理完成，共 {len(batch_files)} 张图片")
+                    
+                    # 获取批次的时间戳范围
+                    first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
+                    logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}")
+                    
+                    # 解析响应中的JSON数据
+                    response_text = result['response']
+                    try:
+                        # 处理可能包含```json```格式的响应
+                        if "```json" in response_text:
+                            json_content = response_text.split("```json")[1].split("```")[0].strip()
+                        elif "```" in response_text:
+                            json_content = response_text.split("```")[1].split("```")[0].strip()
+                        else:
+                            json_content = response_text.strip()
+                            
+                        response_data = json.loads(json_content)
+                        
+                        # 提取frame_observations和overall_activity_summary
+                        if "frame_observations" in response_data:
+                            frame_obs = response_data["frame_observations"]
+                            overall_summary = response_data.get("overall_activity_summary", "")
+                            
+                            # 添加时间戳信息到每个帧观察
+                            for i, obs in enumerate(frame_obs):
+                                if i < len(batch_files):
+                                    # 从文件名中提取时间戳
+                                    file_path = batch_files[i]
+                                    file_name = os.path.basename(file_path)
+                                    # 提取时间戳字符串 (格式如: keyframe_000675_000027000.jpg)
+                                    # 格式解析: keyframe_帧序号_毫秒时间戳.jpg
+                                    timestamp_parts = file_name.split('_')
+                                    if len(timestamp_parts) >= 3:
+                                        timestamp_str = timestamp_parts[-1].split('.')[0]
+                                        try:
+                                            # 修正时间戳解析逻辑
+                                            # 格式为000100000，表示00:01:00,000，即1分钟
+                                            # 需要按照对应位数进行解析:
+                                            # 前两位是小时，中间两位是分钟，后面是秒和毫秒
+                                            if len(timestamp_str) >= 9:  # 确保格式正确
+                                                hours = int(timestamp_str[0:2])
+                                                minutes = int(timestamp_str[2:4])
+                                                seconds = int(timestamp_str[4:6])
+                                                milliseconds = int(timestamp_str[6:9])
+                                                
+                                                # 计算总秒数
+                                                timestamp_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000
+                                                formatted_time = utils.format_time(timestamp_seconds)  # 格式化时间戳
+                                            else:
+                                                # 兼容旧的解析方式
+                                                timestamp_seconds = int(timestamp_str) / 1000  # 转换为秒
+                                                formatted_time = utils.format_time(timestamp_seconds)  # 格式化时间戳
+                                        except ValueError:
+                                            logger.warning(f"无法解析时间戳: {timestamp_str}")
+                                            timestamp_seconds = 0
+                                            formatted_time = "00:00:00,000"
+                                    else:
+                                        logger.warning(f"文件名格式不符合预期: {file_name}")
+                                        timestamp_seconds = 0
+                                        formatted_time = "00:00:00,000"
+                                    
+                                    # 添加额外信息到帧观察
+                                    obs["frame_path"] = file_path
+                                    obs["timestamp"] = formatted_time
+                                    obs["timestamp_seconds"] = timestamp_seconds
+                                    obs["batch_index"] = result['batch_index']
+                                    
+                                    # 使用全局递增的帧计数器替换原始的frame_number
+                                    if "frame_number" in obs:
+                                        obs["original_frame_number"] = obs["frame_number"]  # 保留原始编号作为参考
+                                    obs["frame_number"] = frame_counter  # 赋值连续的帧编号
+                                    frame_counter += 1  # 增加帧计数器
+                                    
+                                    # 添加到合并列表
+                                    merged_frame_observations.append(obs)
+                            
+                            # 添加批次整体总结信息
+                            if overall_summary:
+                                # 从文件名中提取时间戳数值
+                                first_time_str = first_timestamp.split('_')[-1].split('.')[0]
+                                last_time_str = last_timestamp.split('_')[-1].split('.')[0]
+                                
+                                # 转换为毫秒并计算持续时间（秒）
+                                try:
+                                    # 修正解析逻辑，与上面相同的方式解析时间戳
+                                    if len(first_time_str) >= 9 and len(last_time_str) >= 9:
+                                        # 解析第一个时间戳
+                                        first_hours = int(first_time_str[0:2])
+                                        first_minutes = int(first_time_str[2:4])
+                                        first_seconds = int(first_time_str[4:6])
+                                        first_ms = int(first_time_str[6:9])
+                                        first_time_seconds = first_hours * 3600 + first_minutes * 60 + first_seconds + first_ms / 1000
+                                        
+                                        # 解析第二个时间戳
+                                        last_hours = int(last_time_str[0:2])
+                                        last_minutes = int(last_time_str[2:4])
+                                        last_seconds = int(last_time_str[4:6])
+                                        last_ms = int(last_time_str[6:9])
+                                        last_time_seconds = last_hours * 3600 + last_minutes * 60 + last_seconds + last_ms / 1000
+                                        
+                                        batch_duration = last_time_seconds - first_time_seconds
+                                    else:
+                                        # 兼容旧的解析方式
+                                        first_time_ms = int(first_time_str)
+                                        last_time_ms = int(last_time_str)
+                                        batch_duration = (last_time_ms - first_time_ms) / 1000
+                                except ValueError:
+                                    # 使用 utils.time_to_seconds 函数处理格式化的时间戳
+                                    first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ','))
+                                    last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ','))
+                                    batch_duration = last_time_seconds - first_time_seconds
+                                
+                                overall_activity_summaries.append({
+                                    "batch_index": result['batch_index'],
+                                    "time_range": f"{first_timestamp}-{last_timestamp}",
+                                    "duration_seconds": batch_duration,
+                                    "summary": overall_summary
+                                })
+                    except Exception as e:
+                        logger.error(f"解析批次 {result['batch_index']} 的响应数据失败: {str(e)}")
+                        # 添加原始响应作为回退
+                        frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
+                        frame_analysis += response_text
+                        frame_analysis += "\n"
+                    
+                    # 更新上一个批次的文件
+                    prev_batch_files = batch_files
+                
+                # 将合并后的结果转为JSON字符串
+                merged_results = {
+                    "frame_observations": merged_frame_observations,
+                    "overall_activity_summaries": overall_activity_summaries
+                }
+                
+                # 使用当前时间创建文件名
+                now = datetime.now()
+                timestamp_str = now.strftime("%Y%m%d_%H%M")
+                
+                # 保存完整的分析结果为JSON
+                analysis_filename = f"frame_analysis_{timestamp_str}.json"
+                analysis_json_path = os.path.join(analysis_dir, analysis_filename)
+                with open(analysis_json_path, 'w', encoding='utf-8') as f:
+                    json.dump(merged_results, f, ensure_ascii=False, indent=2)
+                logger.info(f"分析结果已保存到: {analysis_json_path}")
+
+                """
+                4. 生成文案
+                """
+                logger.info("开始准备生成解说文案")
+                update_progress(80, "正在生成文案...")
+                from app.services.generate_narration_script import parse_frame_analysis_to_markdown, generate_narration
+                # 从配置中获取文本生成相关配置
+                text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+                text_api_key = config.app.get(f'text_{text_provider}_api_key')
+                text_model = config.app.get(f'text_{text_provider}_model_name')
+                text_base_url = config.app.get(f'text_{text_provider}_base_url')
+                llm_params.update({
+                    "text_provider": text_provider,
+                    "text_api_key": text_api_key,
+                    "text_model_name": text_model,
+                    "text_base_url": text_base_url
+                })
+                chekc_video_config(llm_params)
+                # 整理帧分析数据
+                markdown_output = parse_frame_analysis_to_markdown(analysis_json_path)
+
+                # 生成解说文案
+                narration = generate_narration(
+                    markdown_output,
+                    text_api_key,
+                    base_url=text_base_url,
+                    model=text_model
+                )
+                narration_dict = json.loads(narration)['items']
+                # 为 narration_dict 中每个 item 新增一个 OST: 2 的字段, 代表保留原声和配音
+                narration_dict = [{**item, "OST": 2} for item in narration_dict]
+                logger.debug(f"解说文案创作完成:\n{"\n".join([item['narration'] for item in narration_dict])}")
+                # 结果转换为JSON字符串
+                script = json.dumps(narration_dict, ensure_ascii=False, indent=2)
+
+            except Exception as e:
+                logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}")
+                raise Exception(f"分析失败: {str(e)}")
+
+            if script is None:
+                st.error("生成脚本失败，请检查日志")
+                st.stop()
+            logger.success(f"剪辑脚本生成完成")
+            if isinstance(script, list):
+                st.session_state['video_clip_json'] = script
+            elif isinstance(script, str):
+                st.session_state['video_clip_json'] = json.loads(script)
+            update_progress(80, "脚本生成完成")
+
+        time.sleep(0.1)
+        progress_bar.progress(100)
+        status_text.text("脚本生成完成！")
+        st.success("视频脚本生成成功！")
+
+    except Exception as err:
+        st.error(f"生成过程中发生错误: {str(err)}")
+        logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
+    finally:
+        time.sleep(2)
+        progress_bar.empty()
+        status_text.empty()
diff --git a/webui/tools/generate_script_short.py b/webui/tools/generate_script_short.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4508d95feb8b296ec8a452922c3643c0de84745
--- /dev/null
+++ b/webui/tools/generate_script_short.py
@@ -0,0 +1,92 @@
+import os
+import json
+import time
+import asyncio
+import traceback
+import requests
+import streamlit as st
+from loguru import logger
+
+from app.config import config
+from webui.tools.base import chekc_video_config
+
+
+def generate_script_short(tr, params, custom_clips=5):
+    """
+    Generate a short-video script from the subtitle (.srt) file of the selected video.
+
+    Args:
+        tr: translation function (not referenced in this function body)
+        params: video parameters object; params.video_origin_path is used to derive the .srt path
+        custom_clips: number of clips, passed through to generate_script; defaults to 5
+    """
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    def update_progress(progress: float, message: str = ""):
+        progress_bar.progress(progress)
+        if message:
+            status_text.text(f"{progress}% - {message}")
+        else:
+            status_text.text(f"进度: {progress}%")
+
+    try:
+        with st.spinner("正在生成脚本..."):
+            text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+            text_api_key = config.app.get(f'text_{text_provider}_api_key')
+            text_model = config.app.get(f'text_{text_provider}_model_name')
+            text_base_url = config.app.get(f'text_{text_provider}_base_url')
+            vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
+            vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key', "")
+            vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name', "")
+            vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url', "")
+            narrato_api_key = config.app.get('narrato_api_key')
+
+            update_progress(20, "开始准备生成脚本")
+
+            srt_path = params.video_origin_path.replace(".mp4", ".srt").replace("videos", "srt").replace("video", "subtitle")
+            if not os.path.exists(srt_path):
+                logger.error(f"{srt_path} 文件不存在请检查或重新转录")
+                st.error(f"{srt_path} 文件不存在请检查或重新转录")
+                st.stop()
+
+            api_params = {
+                "vision_provider": vision_llm_provider,
+                "vision_api_key": vision_api_key,
+                "vision_model_name": vision_model,
+                "vision_base_url": vision_base_url or "",
+                "text_provider": text_provider,
+                "text_api_key": text_api_key,
+                "text_model_name": text_model,
+                "text_base_url": text_base_url or ""
+            }
+            chekc_video_config(api_params)
+            from app.services.SDP.generate_script_short import generate_script
+            script = generate_script(
+                srt_path=srt_path,
+                output_path="resource/scripts/merged_subtitle.json",
+                api_key=text_api_key,
+                model_name=text_model,
+                base_url=text_base_url,
+                custom_clips=custom_clips,
+            )
+
+            if script is None:
+                st.error("生成脚本失败，请检查日志")
+                st.stop()
+            logger.info(f"脚本生成完成 {json.dumps(script, ensure_ascii=False, indent=4)}")
+            if isinstance(script, list):
+                st.session_state['video_clip_json'] = script
+            elif isinstance(script, str):
+                st.session_state['video_clip_json'] = json.loads(script)
+            update_progress(80, "脚本生成完成")
+
+        time.sleep(0.1)
+        progress_bar.progress(100)
+        status_text.text("脚本生成完成！")
+        st.success("视频脚本生成成功！")
+
+    except Exception as err:
+        progress_bar.progress(100)
+        st.error(f"生成过程中发生错误: {str(err)}")
+        logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb2a6f4969e23bcaee73aaf0061e558ac9716883
--- /dev/null
+++ b/webui/tools/generate_short_summary.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+'''
+@Project: NarratoAI
+@File   : 短剧解说脚本生成
+@Author : 小林同学
+@Date   : 2025/5/10 下午10:26 
+'''
+import os
+import json
+import time
+import traceback
+import streamlit as st
+from loguru import logger
+
+from app.config import config
+from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script
+
+
+def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature):
+    """
+    Generate a narration ("short drama explanation") video script from a subtitle file.
+    Requirement: a high-quality subtitle file for the drama.
+    Intended use: short dramas.
+    """
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    def update_progress(progress: float, message: str = ""):
+        progress_bar.progress(progress)
+        if message:
+            status_text.text(f"{progress}% - {message}")
+        else:
+            status_text.text(f"进度: {progress}%")
+
+    try:
+        with st.spinner("正在生成脚本..."):
+            if not params.video_origin_path:
+                st.error("请先选择视频文件")
+                return
+            """
+            1. 获取字幕
+            """
+            update_progress(30, "正在解析字幕...")
+            # Make sure the subtitle file exists before calling the model
+            if not os.path.exists(subtitle_path):
+                st.error("字幕文件不存在")
+                return
+
+            """
+            2. 分析字幕总结剧情
+            """
+            text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+            text_api_key = config.app.get(f'text_{text_provider}_api_key')
+            text_model = config.app.get(f'text_{text_provider}_model_name')
+            text_base_url = config.app.get(f'text_{text_provider}_base_url')
+            analysis_result = analyze_subtitle(
+                subtitle_file_path=subtitle_path,
+                api_key=text_api_key,
+                model=text_model,
+                base_url=text_base_url,
+                save_result=True,
+                temperature=temperature
+            )
+            """
+            3. 根据剧情生成解说文案
+            """
+            if analysis_result["status"] == "success":
+                logger.info("字幕分析成功！")
+                update_progress(60, "正在生成文案...")
+
+                # Generate the narration script from the plot analysis
+                narration_result = generate_narration_script(
+                    short_name=video_theme,
+                    plot_analysis=analysis_result["analysis"],
+                    api_key=text_api_key,
+                    model=text_model,
+                    base_url=text_base_url,
+                    save_result=True,
+                    temperature=temperature
+                )
+
+                if narration_result["status"] == "success":
+                    logger.info("\n解说文案生成成功！")
+                    logger.info(narration_result["narration_script"])
+                else:
+                    logger.info(f"\n解说文案生成失败: {narration_result['message']}")
+                    st.error("生成脚本失败，请检查日志")
+                    st.stop()
+            else:
+                logger.error(f"分析失败: {analysis_result['message']}")
+                st.error("生成脚本失败，请检查日志")
+                st.stop()
+
+            """
+            4. 生成文案
+            """
+            logger.info("开始准备生成解说文案")
+
+            # Re-serialise only the 'items' list of the narration JSON as a JSON string
+            narration_script = narration_result["narration_script"]
+            narration_dict = json.loads(narration_script)
+            script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2)
+
+            if script is None:
+                st.error("生成脚本失败，请检查日志")
+                st.stop()
+            logger.success(f"剪辑脚本生成完成")
+            if isinstance(script, list):
+                st.session_state['video_clip_json'] = script
+            elif isinstance(script, str):
+                st.session_state['video_clip_json'] = json.loads(script)
+            update_progress(90, "整理输出...")
+
+        time.sleep(0.1)
+        progress_bar.progress(100)
+        status_text.text("脚本生成完成！")
+        st.success("视频脚本生成成功！")
+
+    except Exception as err:
+        st.error(f"生成过程中发生错误: {str(err)}")
+        logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
+    finally:
+        time.sleep(2)
+        progress_bar.empty()
+        status_text.empty()
diff --git a/webui/utils/cache.py b/webui/utils/cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cc3b05af5b6ffe81d374a3e1946f2cac10ee136
--- /dev/null
+++ b/webui/utils/cache.py
@@ -0,0 +1,33 @@
+import streamlit as st
+import os
+import glob
+from app.utils import utils
+
+def get_fonts_cache(font_dir):
+    """Return the sorted .ttf/.ttc file names found under font_dir, cached in st.session_state."""
+    cache_key = 'fonts_cache'
+    if cache_key not in st.session_state:
+        # Only bare file names are kept, so same-named fonts in different folders repeat.
+        st.session_state[cache_key] = sorted(
+            name for _, _, names in os.walk(font_dir)
+            for name in names if name.endswith((".ttf", ".ttc"))
+        )
+    return st.session_state[cache_key]
+
+def get_video_files_cache():
+    """Return mp4/mov/avi/mkv paths from utils.video_dir() in reversed glob order, cached per session."""
+    if 'video_files_cache' not in st.session_state:
+        suffixes = ("*.mp4", "*.mov", "*.avi", "*.mkv")
+        found = [path for suffix in suffixes for path in glob.glob(os.path.join(utils.video_dir(), suffix))]
+        st.session_state['video_files_cache'] = list(reversed(found))
+    return st.session_state['video_files_cache']
+
+def get_songs_cache(song_dir):
+    """Return the .mp3 file names found under song_dir (walk order, unsorted), cached per session."""
+    cache_key = 'songs_cache'
+    if cache_key not in st.session_state:
+        st.session_state[cache_key] = [
+            name for _, _, names in os.walk(song_dir)
+            for name in names if name.endswith(".mp3")
+        ]
+    return st.session_state[cache_key]
\ No newline at end of file
diff --git a/webui/utils/file_utils.py b/webui/utils/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6b1238e283f95f5f347de9c3e2365c3fd48662b
--- /dev/null
+++ b/webui/utils/file_utils.py
@@ -0,0 +1,230 @@
+import os
+import glob
+import time
+import platform
+import shutil
+from uuid import uuid4
+from loguru import logger
+from app.utils import utils
+
+def open_task_folder(root_dir, task_id):
+    """Open <root_dir>/storage/tasks/<task_id> in the platform's file manager.
+    Args:
+        root_dir: project root directory
+        task_id: task ID
+    """
+    import subprocess  # launched as an argv list: no shell, so spaces in the path are safe
+    try:
+        path = os.path.join(root_dir, "storage", "tasks", task_id)
+        if not os.path.exists(path):
+            return
+        system = platform.system()
+        if system == 'Windows':
+            os.startfile(path)
+        elif system in ('Darwin', 'Linux'):
+            subprocess.run(["open" if system == 'Darwin' else "xdg-open", path])
+    except Exception as e:
+        logger.error(f"打开任务文件夹失败: {e}")
+
+def cleanup_temp_files(temp_dir, max_age=3600):
+    """Delete entries directly under temp_dir whose ctime is more than max_age seconds old.
+    Args:
+        temp_dir: temporary directory; nothing happens if it does not exist
+        max_age: maximum age in seconds, compared against os.path.getctime
+    """
+    for entry in (os.listdir(temp_dir) if os.path.exists(temp_dir) else []):
+        file_path = os.path.join(temp_dir, entry)
+        try:
+            expired = os.path.getctime(file_path) < time.time() - max_age
+            if expired and os.path.isfile(file_path):
+                os.remove(file_path)
+            elif expired and os.path.isdir(file_path):
+                shutil.rmtree(file_path)
+            if expired:
+                logger.debug(f"已清理临时文件: {file_path}")
+        except Exception as e:
+            logger.error(f"清理临时文件失败: {file_path}, 错误: {e}")
+
+def get_file_list(directory, file_types=None, sort_by='ctime', reverse=True):
+    """List the entries directly inside a directory together with basic stat info.
+    Args:
+        directory: directory to scan (not recursive)
+        file_types: optional suffixes such as ['.mp4', '.mov']; each is globbed as "*<suffix>"
+        sort_by: 'ctime', 'mtime', 'size' or 'name'; any other value skips sorting
+        reverse: sort in descending order when True
+    Returns:
+        list: dicts with keys name, path, size, ctime, mtime; [] if directory does not exist
+    """
+    if not os.path.exists(directory):
+        return []
+
+    # An empty or missing file_types means "match everything".
+    patterns = [f"*{suffix}" for suffix in file_types] if file_types else ["*"]
+    matched = []
+    for pattern in patterns:
+        matched += glob.glob(os.path.join(directory, pattern))
+
+    entries = []
+    for file_path in matched:
+        # Paths that cannot be stat-ed are logged and left out of the result.
+        try:
+            info = os.stat(file_path)
+        except Exception as e:
+            logger.error(f"获取文件信息失败: {file_path}, 错误: {e}")
+            continue
+        entries.append({
+            "name": os.path.basename(file_path),
+            "path": file_path,
+            "size": info.st_size,
+            "ctime": info.st_ctime,
+            "mtime": info.st_mtime
+        })
+
+    # Sort in place on the requested key.
+    if sort_by in ('ctime', 'mtime', 'size', 'name'):
+        entries.sort(key=lambda item: item.get(sort_by, ''), reverse=reverse)
+
+    return entries
+
+def save_uploaded_file(uploaded_file, save_dir, allowed_types=None):
+    """Save an uploaded file into save_dir and return the path it was written to.
+    Args:
+        uploaded_file: Streamlit UploadedFile-like object exposing .name and .read()
+        save_dir: destination directory, created when missing
+        allowed_types: optional lowercase extensions, e.g. ['.mp4', '.mov']
+    Returns:
+        str: path of the saved file, or None on failure or disallowed type
+    """
+    try:
+        os.makedirs(save_dir, exist_ok=True)
+
+        # The name comes from the client: keep only its final component so a value
+        # such as "../../x.mp4" cannot be written outside save_dir.
+        safe_name = os.path.basename(uploaded_file.name)
+        file_name, file_extension = os.path.splitext(safe_name)
+
+        # Check the file type (the extension is lowercased before comparing)
+        if allowed_types and file_extension.lower() not in allowed_types:
+            logger.error(f"不支持的文件类型: {file_extension}")
+            return None
+
+        # If a file with this name already exists, append a timestamp instead of overwriting
+        save_path = os.path.join(save_dir, safe_name)
+        if os.path.exists(save_path):
+            timestamp = time.strftime("%Y%m%d%H%M%S")
+            save_path = os.path.join(save_dir, f"{file_name}_{timestamp}{file_extension}")
+
+        with open(save_path, "wb") as f:
+            f.write(uploaded_file.read())
+
+        logger.info(f"文件保存成功: {save_path}")
+        return save_path
+
+    except Exception as e:
+        logger.error(f"保存上传文件失败: {e}")
+        return None
+
+def create_temp_file(prefix='tmp', suffix='', directory=None):
+    """Build a unique temp-file path; the directory is created, the file itself is not.
+    Args:
+        prefix: file name prefix
+        suffix: file extension appended after the UUID
+        directory: target directory; defaults to utils.storage_dir("temp", create=True)
+    Returns:
+        str: path of the form <directory>/<prefix>-<uuid4><suffix>, or None on error
+    """
+    try:
+        target_dir = utils.storage_dir("temp", create=True) if directory is None else directory
+
+        if not os.path.exists(target_dir):
+            os.makedirs(target_dir)
+
+        # Only the path is returned; nothing is written to disk here.
+        unique_name = f"{prefix}-{str(uuid4())}{suffix}"
+        return os.path.join(target_dir, unique_name)
+
+    except Exception as e:
+        logger.error(f"创建临时文件失败: {e}")
+        return None
+
+def get_file_size(file_path, format='MB'):
+    """Return the size of a file in the requested unit.
+    Args:
+        file_path: path of the file
+        format: 'B', 'KB', 'MB' or 'GB' (case-insensitive); unknown units yield bytes
+    Returns:
+        float: size in the unit (int for bytes); 0 if the size cannot be read
+    """
+    try:
+        size_bytes = os.path.getsize(file_path)
+
+        unit = format.upper()
+        divisors = {
+            'KB': 1024,
+            'MB': 1024 * 1024,
+            'GB': 1024 * 1024 * 1024,
+        }
+        if unit in divisors:
+            return size_bytes / divisors[unit]
+        # 'B' and any unrecognised unit both return the raw byte count.
+        return size_bytes
+
+    except Exception as e:
+        logger.error(f"获取文件大小失败: {file_path}, 错误: {e}")
+        return 0
+
+def ensure_directory(directory):
+    """Create directory (including parents) unless the path already exists.
+    Args:
+        directory: directory path
+    Returns:
+        bool: False if an exception was raised, True otherwise (any existing path counts)
+    """
+    try:
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+    except Exception as e:
+        logger.error(f"创建目录失败: {directory}, 错误: {e}")
+        return False
+    return True
+
+def create_zip(files: list, zip_path: str, base_dir: str = None, folder_name: str = "demo") -> bool:
+    """
+    Pack files into a zip archive, nested under a single top-level folder.
+    Args:
+        files: paths of the files to pack; missing ones are skipped with a warning
+        zip_path: where to write the archive; a bare file name targets the current directory
+        base_dir: when given, entries keep their path relative to this directory
+        folder_name: top-level folder inside the archive, "demo" by default
+    Returns:
+        bool: True if the archive was written (even if some files were skipped), else False
+    """
+    try:
+        import zipfile
+
+        # dirname() is '' for a bare file name, and os.makedirs('') raises, so only
+        # create the parent when there actually is one.
+        parent_dir = os.path.dirname(zip_path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+
+        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            for file in files:
+                if not os.path.exists(file):
+                    logger.warning(f"文件不存在，跳过: {file}")
+                    continue
+
+                # Entry name = folder_name + (path relative to base_dir, or the bare file name)
+                inner = os.path.relpath(file, base_dir) if base_dir else os.path.basename(file)
+                arcname = os.path.join(folder_name, inner)
+                try:
+                    zipf.write(file, arcname)
+                except Exception as e:
+                    # One unreadable file should not abort the whole archive.
+                    logger.error(f"添加文件到zip失败: {file}, 错误: {e}")
+
+        return True
+
+    except Exception as e:
+        logger.error(f"创建zip文件失败: {e}")
+        return False
\ No newline at end of file
diff --git a/webui/utils/merge_video.py b/webui/utils/merge_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d21de30ac7c96fc96c8bee6798bb17c71549f5f
--- /dev/null
+++ b/webui/utils/merge_video.py
@@ -0,0 +1,115 @@
+"""
+合并视频和字幕文件
+"""
+import os
+import pysrt
+from moviepy import VideoFileClip, concatenate_videoclips
+
+
+def get_video_duration(video_path):
+    """Return the duration, in seconds, of the video at video_path."""
+    # The clip is only needed long enough to read its duration; the context
+    # manager closes it again on the way out.
+    with VideoFileClip(video_path) as clip:
+        return clip.duration
+
+
+def adjust_subtitle_timing(subtitle_path, time_offset):
+    """Load an SRT file and shift every cue by time_offset seconds.
+
+    Args:
+        subtitle_path: path of the .srt file to read (the file itself is not modified)
+        time_offset: offset in seconds; may be fractional
+
+    Returns:
+        The pysrt subtitle object with shifted start/end times.
+    """
+    subs = pysrt.open(subtitle_path)
+
+    # Convert to whole milliseconds once and let pysrt carry the overflow. Splitting the
+    # float into h/m/s/ms with int() truncates (1.001 * 1000 is 1000.9999999999999, so
+    # the millisecond part became 0) and mis-splits negative offsets.
+    subs.shift(milliseconds=round(time_offset * 1000))
+    return subs
+
+
+def merge_videos_and_subtitles(video_paths, subtitle_paths, output_video_path, output_subtitle_path):
+    """Concatenate videos and merge their SRT files, offsetting each by the preceding durations."""
+    if len(video_paths) != len(subtitle_paths):
+        raise ValueError("视频文件数量与字幕文件数量不匹配")
+
+    # 1. Collect the clips and build the merged subtitle file
+    video_clips = []
+    accumulated_duration = 0
+    merged_subs = pysrt.SubRipFile()
+
+    try:
+        # Walk the video/subtitle pairs in order
+        for i, (video_path, subtitle_path) in enumerate(zip(video_paths, subtitle_paths)):
+            # Open the video; it stays open until the finally block closes it
+            print(f"处理视频 {i + 1}/{len(video_paths)}: {video_path}")
+            video_clip = VideoFileClip(video_path)
+            video_clips.append(video_clip)
+
+            # Load the matching subtitles
+            print(f"处理字幕 {i + 1}/{len(subtitle_paths)}: {subtitle_path}")
+            if i == 0:
+                # The first subtitle file is used as-is
+                current_subs = pysrt.open(subtitle_path)
+            else:
+                # Later files are shifted by the total duration of the preceding videos
+                current_subs = adjust_subtitle_timing(subtitle_path, accumulated_duration)
+
+            # Append to the merged subtitles (NOTE(review): cue indexes are not renumbered here — confirm)
+            merged_subs.extend(current_subs)
+
+            # Advance the running offset by this clip's duration
+            accumulated_duration += video_clip.duration
+
+        # Skip the concatenation when the output video already exists
+        if not os.path.exists(output_video_path):
+            print("合并视频中...")
+            final_video = concatenate_videoclips(video_clips)
+
+            # Write the merged video
+            print("保存合并后的视频...")
+            final_video.write_videofile(output_video_path, audio_codec='aac')
+
+        # Write the merged subtitles (always, even when the video was skipped)
+        print("保存合并后的字幕...")
+        merged_subs.save(output_subtitle_path, encoding='utf-8')
+
+        print("合并完成")
+
+    finally:
+        # Release every opened source clip, on success and on error
+        for clip in video_clips:
+            clip.close()
+
+
+def main():
+    """Example run: merge temp/1.mp4 .. temp/5.mp4 with their matching .srt files.
+
+    All paths are hard-coded sample values, relative to the working directory;
+    the results go to temp/merged_video.mp4 and temp/merged_subtitle.srt.
+    """
+    clip_numbers = range(1, 6)
+
+    # Video N is paired with subtitle N by position in the two lists.
+    video_paths = [f"temp/{number}.mp4" for number in clip_numbers]
+    subtitle_paths = [f"temp/{number}.srt" for number in clip_numbers]
+
+    output_video_path = "temp/merged_video.mp4"
+    output_subtitle_path = "temp/merged_subtitle.srt"
+
+    merge_videos_and_subtitles(
+        video_paths,
+        subtitle_paths,
+        output_video_path,
+        output_subtitle_path,
+    )
+
+
+# Run the example only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/webui/utils/vision_analyzer.py b/webui/utils/vision_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e0fecd1a5aa2963401bfffa9721a6496cfeddad
--- /dev/null
+++ b/webui/utils/vision_analyzer.py
@@ -0,0 +1,100 @@
+import logging
+from typing import List, Dict, Any, Optional
+from app.utils import gemini_analyzer, qwenvl_analyzer
+
+logger = logging.getLogger(__name__)
+
+class VisionAnalyzer:
+    """Thin facade that forwards image analysis to a Gemini or QwenVL backend."""
+
+    def __init__(self):
+        # No backend is attached until one of the initialize_* methods runs.
+        self.provider = None
+        self.api_key = None
+        self.model = None
+        self.base_url = None
+        self.analyzer = None
+
+    def _store_settings(self, provider: str, api_key: str, model: str, base_url: str) -> None:
+        """Record the provider tag and the settings it was initialized with."""
+        self.provider = provider
+        self.api_key = api_key
+        self.model = model
+        self.base_url = base_url
+
+    def initialize_gemini(self, api_key: str, model: str, base_url: str) -> None:
+        """
+        Attach a Gemini vision backend.
+
+        Args:
+            api_key: Gemini API key
+            model: model name
+            base_url: API base URL (kept on the instance only)
+        """
+        self._store_settings('gemini', api_key, model, base_url)
+        # Only the model name and key are handed to the backend constructor.
+        self.analyzer = gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
+
+    def initialize_qwenvl(self, api_key: str, model: str, base_url: str) -> None:
+        """
+        Attach a QwenVL vision backend.
+
+        Args:
+            api_key: Alibaba Cloud API key
+            model: model name
+            base_url: API base URL (kept on the instance only)
+        """
+        self._store_settings('qwenvl', api_key, model, base_url)
+        # Only the model name and key are handed to the backend constructor.
+        self.analyzer = qwenvl_analyzer.QwenAnalyzer(model_name=model, api_key=api_key)
+
+    async def analyze_images(self, images: List[str], prompt: str, batch_size: int = 5) -> Dict[str, Any]:
+        """
+        Run the attached backend over a list of images.
+
+        Args:
+            images: list of image paths
+            prompt: analysis prompt
+            batch_size: number of images per batch, 5 by default
+
+        Returns:
+            Dict: the analysis result returned by the backend
+
+        Raises:
+            ValueError: if no backend has been initialized
+        """
+        backend = self.analyzer
+        if not backend:
+            raise ValueError("未初始化视觉分析器")
+        request = dict(images=images, prompt=prompt, batch_size=batch_size)
+        return await backend.analyze_images(**request)
+
+def create_vision_analyzer(provider: str, **kwargs) -> VisionAnalyzer:
+    """
+    Build a VisionAnalyzer and initialize it for the requested provider.
+
+    Args:
+        provider: provider name, 'gemini' or 'qwenvl' (case-insensitive)
+        **kwargs: provider settings; 'api_key', 'model' and 'base_url' are
+            read, and any of them that is missing is passed on as None
+
+    Returns:
+        VisionAnalyzer: the initialized analyzer instance
+
+    Raises:
+        ValueError: if the provider is not one of the supported names
+    """
+    analyzer = VisionAnalyzer()
+
+    # Map each normalized provider name to the initializer that handles it.
+    initializers = {
+        'gemini': analyzer.initialize_gemini,
+        'qwenvl': analyzer.initialize_qwenvl,
+    }
+    initialize = initializers.get(provider.lower())
+    if initialize is None:
+        raise ValueError(f"不支持的视觉分析提供商: {provider}")
+
+    settings = {key: kwargs.get(key) for key in ('api_key', 'model', 'base_url')}
+    initialize(**settings)
+    return analyzer
\ No newline at end of file