Spaces:

weiyi01191
/

DeepOperateAI-Video

Running

App Files Community

weiyi01191 committed on Jun 10

Commit

1f33751

verified ·

1 Parent(s): 8feb86f

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -75

app.py CHANGED Viewed

@@ -200,45 +200,20 @@ def get_subtitles(video_path):
     try:
         extract_audio(video_path, audio_path)
-        # 🔧 优化中文语音识别
-        print("🎙️ 开始中文语音识别...")
-        result = whisper_model.transcribe(
-            audio_path,
-            language="zh",  # 指定中文
-            task="transcribe",  # 转录任务
-            word_timestamps=True,  # 启用词级时间戳
-            verbose=True,  # 详细输出
-            temperature=0.2,  # 降低温度提高准确性
-            beam_size=5,  # 增加beam size
-            best_of=5,  # 最佳候选数
-            fp16=True  # 使用fp16加速
-        )
-        # 创建VTT文件，包含更详细的信息
         with open(subtitle_path, "w", encoding="utf-8") as vtt_file:
             vtt_file.write("WEBVTT\n\n")
-            for i, segment in enumerate(result['segments']):
                 start = format_timestamp(segment['start'])
                 end = format_timestamp(segment['end'])
-                text = segment['text'].strip()
-                # 🔧 添加说话人标识和详细信息
-                if text:
-                    # 尝试识别说话人（基于音频特征变化）
-                    speaker_id = f"说话人{(i // 3) % 3 + 1}"  # 简单的说话人分配策略
-                    # 格式化字幕，包含说话人信息
-                    formatted_text = f"【{speaker_id}】: {text}"
-                    vtt_file.write(f"{start} --> {end}\n{formatted_text}\n\n")
-        print(f"✅ 中文字幕生成完成: {len(result['segments'])} 个片段")
         return subtitle_path
     except Exception as e:
-        print(f"❌ 中文字幕生成错误: {e}")
         return None
 def prepare_input(video_path, subtitle_path, instruction):
@@ -485,7 +460,7 @@ def load_minigpt4_model():
             print(f"💾 模型加载后显存使用: {torch.cuda.memory_allocated(0) / 1024**3:.1f} GB")
         print("🚀 开始初始化Whisper模型...")
-        whisper_model = whisper.load_model("medium").to(f"cuda:{whisper_gpu_id}" if torch.cuda.is_available() else "cpu")  # 升级到medium模型提高中文识别准确性
         if torch.cuda.is_available():
             print(f"💾 全部加载后显存使用: {torch.cuda.memory_allocated(0) / 1024**3:.1f} GB")
@@ -535,29 +510,15 @@ def analyze_video_with_minigpt4(video_file, instruction):
             temp_video_path = os.path.join(temp_dir, "analysis_video.mp4")
             shutil.copy2(video_file, temp_video_path)
-        # 🔧 使用详细的中文分析指令
         if not instruction or instruction.strip() == "":
-            instruction = """请非常详细地分析这个视频的内容，包括：
-1. **场景描述**: 详细描述视频中的环境、背景、位置等
-2. **人物分析**: 描述出现的每个人物的外貌、服装、动作
-3. **对话内容**: 准确记录视频中每个人说的话，包括：
-   - 谁在说话
-   - 具体说了什么内容
-   - 说话的语气和情感
-4. **动作描述**: 详细描述人物的动作、手势、表情变化
-5. **声音元素**: 除了对话，还有背景音乐、音效等
-6. **画面细节**: 物品、文字、标识等可见元素
-7. **情节发展**: 视频的故事情节或事件发展过程
-请用中文详细描述，特别注意记录完整的对话内容和说话人信息。"""
         # 调用MiniGPT4-Video的生成函数
-        print("🎬 开始MiniGPT4-Video深度内容分析...")
         prediction = generate_prediction(
             video_path=temp_video_path,
             instruction=instruction,
-            gen_subtitles=True,  # 生成中文字幕
             stream=False
         )
@@ -572,24 +533,21 @@ def analyze_video_with_minigpt4(video_file, instruction):
 📋 **基本信息**:
 - 视频文件: {os.path.basename(video_file)}
 - 分析设备: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU模式'}
-- 语言设置: 中文优化
-- 字幕识别: 中文语音转文字
-🎭 **详细视频内容分析**:
 {prediction}
 {format_violations_report(violations_result)}
 📊 **技术信息**:
-- 内容理解: MiniGPT4-Video + Whisper中文识别
 - 规则引擎: 巨量引擎299条禁投规则
 - 检测等级: P1(低危) + P2(中危) + P3(高危)
-- 分析模式: 多模态理解 (视觉+中文语音+文本)
-- 对话识别: 说话人标识 + 完整对话内容
 💡 **说明**:
-基于MiniGPT4-Video的深度中文内容理解，结合Whisper中文语音识别和巨量引擎完整禁投规则库进行专业违规检测。
-特别优化了中文对话识别和说话人标识功能。
         """
         # 获取综合风险等级
@@ -607,7 +565,7 @@ def analyze_video_with_minigpt4(video_file, instruction):
 1. 检查视频文件格式 (建议MP4)
 2. 确认模型文件是否正确加载
 3. 检查GPU内存是否充足
-4. 验证中文语音识别配置
 💡 **提示**: 如果问题持续，请检查模型和依赖项安装
         """
@@ -622,8 +580,8 @@ def create_app():
             gr.Video(label="上传视频文件"),
             gr.Textbox(
                 label="分析指令",
-                value="请详细分析视频内容，包括完整的对话记录。请准确记录谁说了什么话，以及所有可见和可听的元素。",
-                placeholder="输入您希望AI如何分析这个视频...\n例如：请记录视频中所有人的对话内容和说话人信息",
                 lines=3
             )
         ],
@@ -631,25 +589,18 @@ def create_app():
             gr.Textbox(label="MiniGPT4-Video 内容分析 + 巨量引擎规则检测", lines=20),
             gr.Textbox(label="巨量引擎风险评级")
         ],
-        title="🎥 智能视频内容安全分析 - MiniGPT4-Video + 巨量引擎 (中文优化版)",
         description="""
-        ## 🎬 基于MiniGPT4-Video + 巨量引擎299条禁投规则的专业视频安全检测系统 (中文对话优化)
-        ⚡ **ZeroGPU加速** | 🎬 **MiniGPT4-Video** | 🎙️ **Whisper中文语音** | 🛡️ **巨量引擎299条规则**
         **🔥 核心功能:**
         - 🎞️ **深度视频理解**: MiniGPT4-Video多模态分析
-        - 🎙️ **中文语音识别**: Whisper Medium模型，精准转录中文对话
-        - 👥 **说话人识别**: 自动标识不同说话人和完整对话内容
         - 🛡️ **专业违规检测**: 巨量引擎完整禁投规则库
         - 📊 **智能风险评级**: P0-P3四级风险等级
-        **🎯 中文优化特色:**
-        - **🎭 对话记录**: 准确识别"谁说了什么话"
-        - **📝 完整转录**: 中文语音转文字，包含说话人信息
-        - **🎬 详细分析**: 场景、人物、动作、情感的全面描述
-        - **🔍 内容理解**: 专门优化的中文内容分析指令
         **🎯 检测维度:**
         - **高危(P3)**: 违法出版物、烟草、医疗等严重违规
         - **中危(P2)**: 赌博周边、房地产、金融等中等风险
@@ -660,10 +611,10 @@ def create_app():
         金融类、医疗类、烟草类等全部299条巨量引擎禁投规则
         """,
         examples=[
-            [None, "请记录视频中所有人的完整对话内容，包括谁说了什么话"],
-            [None, "详细分析视频场景和人物对话，检测是否包含禁投内容"],
-            [None, "分析视频中的说话人信息和对话内容，评估投放风险"],
-            [None, "请提供完整的视频内容描述，包括所有可听见的对话"]
         ],
         cache_examples=False
     )

     try:
         extract_audio(video_path, audio_path)
+        result = whisper_model.transcribe(audio_path, language="en")
+        # 创建VTT文件
         with open(subtitle_path, "w", encoding="utf-8") as vtt_file:
             vtt_file.write("WEBVTT\n\n")
+            for segment in result['segments']:
                 start = format_timestamp(segment['start'])
                 end = format_timestamp(segment['end'])
+                text = segment['text']
+                vtt_file.write(f"{start} --> {end}\n{text}\n\n")
         return subtitle_path
     except Exception as e:
+        print(f"字幕生成错误: {e}")
         return None
 def prepare_input(video_path, subtitle_path, instruction):
             print(f"💾 模型加载后显存使用: {torch.cuda.memory_allocated(0) / 1024**3:.1f} GB")
         print("🚀 开始初始化Whisper模型...")
+        whisper_model = whisper.load_model("base").to(f"cuda:{whisper_gpu_id}" if torch.cuda.is_available() else "cpu")
         if torch.cuda.is_available():
             print(f"💾 全部加载后显存使用: {torch.cuda.memory_allocated(0) / 1024**3:.1f} GB")
             temp_video_path = os.path.join(temp_dir, "analysis_video.mp4")
             shutil.copy2(video_file, temp_video_path)
+        # 使用MiniGPT4-Video进行真实分析
         if not instruction or instruction.strip() == "":
+            instruction = "请详细分析这个视频的内容，包括场景、人物、动作、对话等，并描述所有可见和可听的元素。"
         # 调用MiniGPT4-Video的生成函数
         prediction = generate_prediction(
             video_path=temp_video_path,
             instruction=instruction,
+            gen_subtitles=True,  # 生成字幕
             stream=False
         )
 📋 **基本信息**:
 - 视频文件: {os.path.basename(video_file)}
 - 分析设备: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU模式'}
+- 分析指令: {instruction}
+🔍 **视频内容描述**:
 {prediction}
 {format_violations_report(violations_result)}
 📊 **技术信息**:
+- 内容理解: MiniGPT4-Video + Whisper
 - 规则引擎: 巨量引擎299条禁投规则
 - 检测等级: P1(低危) + P2(中危) + P3(高危)
+- 分析模式: 多模态理解 (视觉+语音+文本)
 💡 **说明**:
+基于MiniGPT4-Video的深度内容理解，结合巨量引擎完整禁投规则库进行专业违规检测。
         """
         # 获取综合风险等级
 1. 检查视频文件格式 (建议MP4)
 2. 确认模型文件是否正确加载
 3. 检查GPU内存是否充足
+4. 验证配置文件路径
 💡 **提示**: 如果问题持续，请检查模型和依赖项安装
         """
             gr.Video(label="上传视频文件"),
             gr.Textbox(
                 label="分析指令",
+                value="请详细分析这个视频的内容，包括场景、人物、动作、对话等，并描述所有可见和可听的元素。",
+                placeholder="输入您希望AI如何分析这个视频...",
                 lines=3
             )
         ],
             gr.Textbox(label="MiniGPT4-Video 内容分析 + 巨量引擎规则检测", lines=20),
             gr.Textbox(label="巨量引擎风险评级")
         ],
+        title="🎥 智能视频内容安全分析 - MiniGPT4-Video + 巨量引擎",
         description="""
+        ## 🎬 基于MiniGPT4-Video + 巨量引擎299条禁投规则的专业视频安全检测系统
+        ⚡ **ZeroGPU加速** | 🎬 **MiniGPT4-Video** | 🎙️ **Whisper语音** | 🛡️ **巨量引擎299条规则**
         **🔥 核心功能:**
         - 🎞️ **深度视频理解**: MiniGPT4-Video多模态分析
+        - 🎙️ **语音转文字**: Whisper自动生成字幕
         - 🛡️ **专业违规检测**: 巨量引擎完整禁投规则库
         - 📊 **智能风险评级**: P0-P3四级风险等级
         **🎯 检测维度:**
         - **高危(P3)**: 违法出版物、烟草、医疗等严重违规
         - **中危(P2)**: 赌博周边、房地产、金融等中等风险
         金融类、医疗类、烟草类等全部299条巨量引擎禁投规则
         """,
         examples=[
+            [None, "分析这个视频是否包含禁投内容"],
+            [None, "检测视频中是否有巨量引擎禁止的产品或服务"],
+            [None, "评估视频内容的投放风险等级"],
+            [None, "详细描述视频内容并进行合规检查"]
         ],
         cache_examples=False
     )