import gradio as gr
import os
import sys
import tempfile
import subprocess
import traceback
import librosa
import soundfile as sf
import torch
# Install required packages
def install_requirements():
"""Install required packages if not already installed"""
try:
import nemo
print("NeMo already installed")
except ImportError:
print("Installing NeMo...")
subprocess.run([
"pip", "install",
"nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git"
], check=True)
try:
import moviepy
print("MoviePy already installed")
except ImportError:
print("Installing MoviePy...")
subprocess.run(["pip", "install", "moviepy"], check=True)
# Try to install requirements
try:
install_requirements()
from nemo.collections.speechlm2.models import SALM
import moviepy.editor as mp
DEPENDENCIES_AVAILABLE = True
except Exception as e:
print(f"Warning: Could not install dependencies: {e}")
DEPENDENCIES_AVAILABLE = False
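# Optional sanity check (a hedged sketch, not required by the app): the
# 2.5B-parameter model is impractical on CPU, so surface GPU availability
# early in the logs.
if DEPENDENCIES_AVAILABLE:
    print(f"CUDA available: {torch.cuda.is_available()}")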
class VideoQASummarizer:
def __init__(self):
self.model = None
self.current_transcript = ""
self.model_loaded = False
def load_model(self):
"""Load the Canary-Qwen-2.5B model"""
if not DEPENDENCIES_AVAILABLE:
return "Error: Required dependencies not available. Please install manually."
try:
if self.model is None:
print("Loading Canary-Qwen-2.5B model...")
self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
self.model_loaded = True
return "Model loaded successfully!"
return "Model already loaded."
except Exception as e:
error_msg = f"Error loading model: {str(e)}"
print(error_msg)
print(traceback.format_exc())
return error_msg
def extract_audio_from_video(self, video_path: str) -> str:
"""Extract audio from video file"""
try:
# Create temporary audio file
temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
temp_audio_path = temp_audio.name
temp_audio.close()
            # Load the video and grab its audio track
            video = mp.VideoFileClip(video_path)
            audio = video.audio
            if audio is None:
                video.close()
                raise ValueError("The video file has no audio track.")
            # Write audio to the temporary file
            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
# Clean up
audio.close()
video.close()
return temp_audio_path
except Exception as e:
raise Exception(f"Error extracting audio: {str(e)}")
def preprocess_audio(self, audio_path: str) -> str:
"""Preprocess audio for the model (ensure correct format)"""
try:
            # Load audio, resampling to 16 kHz mono (the format the model expects)
            audio, sr = librosa.load(audio_path, sr=16000)
# Create new temporary file for processed audio
temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
temp_processed_path = temp_processed.name
temp_processed.close()
# Save processed audio
sf.write(temp_processed_path, audio, 16000)
return temp_processed_path
except Exception as e:
raise Exception(f"Error preprocessing audio: {str(e)}")
def transcribe_audio(self, audio_path: str) -> str:
"""Transcribe audio using Canary-Qwen-2.5B in ASR mode"""
try:
if not self.model_loaded:
return "Error: Model not loaded. Please load the model first."
# Preprocess audio
processed_audio_path = self.preprocess_audio(audio_path)
# Transcribe using ASR mode
answer_ids = self.model.generate(
prompts=[
[{"role": "user", "content": f"Transcribe the following: {self.model.audio_locator_tag}", "audio": [processed_audio_path]}]
],
max_new_tokens=512,
)
transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
# Clean up temporary file
os.unlink(processed_audio_path)
return transcript.strip()
except Exception as e:
error_msg = f"Error during transcription: {str(e)}"
print(error_msg)
print(traceback.format_exc())
return error_msg
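    # Hedged sketch: a single generate() call over a very long recording can
    # exhaust memory or degrade quality. One simple workaround (an assumption,
    # not part of the original flow) is to transcribe fixed-size chunks and
    # join the pieces.
    def transcribe_in_chunks(self, audio_path: str, chunk_seconds: int = 30) -> str:
        """Illustrative helper: transcribe long audio chunk by chunk."""
        audio, sr = librosa.load(audio_path, sr=16000)
        step = chunk_seconds * sr
        pieces = []
        for start in range(0, len(audio), step):
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            tmp.close()
            sf.write(tmp.name, audio[start:start + step], 16000)
            pieces.append(self.transcribe_audio(tmp.name))
            os.unlink(tmp.name)
        return " ".join(pieces)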
def answer_question(self, question: str, transcript: str) -> str:
"""Answer questions about the transcript using LLM mode"""
try:
if not self.model_loaded:
return "Error: Model not loaded. Please load the model first."
if not transcript:
return "Error: No transcript available. Please transcribe a video first."
# Use LLM mode to answer questions
prompt = f"Based on the following transcript, please answer this question: {question}\n\nTranscript: {transcript}"
with self.model.llm.disable_adapter():
answer_ids = self.model.generate(
prompts=[[{"role": "user", "content": prompt}]],
max_new_tokens=512,
)
answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
return answer.strip()
except Exception as e:
error_msg = f"Error answering question: {str(e)}"
print(error_msg)
print(traceback.format_exc())
return error_msg
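    # Hedged note: transcripts from long videos may exceed the LLM's context
    # window. A crude illustrative guard (an assumption, not from the model
    # card) is to clip the transcript before building the prompt.
    @staticmethod
    def _truncate_transcript(text: str, max_chars: int = 8000) -> str:
        """Illustrative helper: clip overly long transcripts."""
        return text if len(text) <= max_chars else text[:max_chars] + " ..."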
def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
"""Summarize the transcript using LLM mode"""
try:
if not self.model_loaded:
return "Error: Model not loaded. Please load the model first."
if not transcript:
return "Error: No transcript available. Please transcribe a video first."
# Create different summary prompts based on type
if summary_type == "bullet_points":
prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
elif summary_type == "detailed":
prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
else: # general
prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"
with self.model.llm.disable_adapter():
answer_ids = self.model.generate(
prompts=[[{"role": "user", "content": prompt}]],
max_new_tokens=1024,
)
summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
return summary.strip()
except Exception as e:
error_msg = f"Error creating summary: {str(e)}"
print(error_msg)
print(traceback.format_exc())
return error_msg
# Initialize the app-level wrapper (the model itself is loaded on demand)
qa_summarizer = VideoQASummarizer()
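# Usage sketch (comments only, assuming the model is loaded and an audio
# file already exists on disk; "speech.wav" is a hypothetical path):
#   qa_summarizer.load_model()
#   text = qa_summarizer.transcribe_audio("speech.wav")
#   print(qa_summarizer.summarize_transcript(text, "bullet_points"))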
def load_model_interface():
"""Interface function to load the model"""
return qa_summarizer.load_model()
def process_video(video_file):
"""Process uploaded video and return transcript"""
if video_file is None:
return "Please upload a video file.", ""
    try:
        # Extract audio from the uploaded video
        audio_path = qa_summarizer.extract_audio_from_video(video_file)
        # Transcribe the extracted audio
        transcript = qa_summarizer.transcribe_audio(audio_path)
        # Clean up the temporary audio file
        if os.path.exists(audio_path):
            os.unlink(audio_path)
        # transcribe_audio reports failures as strings rather than raising
        if transcript.startswith("Error"):
            return transcript, ""
        # Store the transcript for the Q&A and summary steps
        qa_summarizer.current_transcript = transcript
        return "Video processed successfully!", transcript
except Exception as e:
error_msg = f"Error processing video: {str(e)}"
print(error_msg)
print(traceback.format_exc())
return error_msg, ""
def answer_question_interface(question, transcript):
"""Interface function to answer questions"""
if not question.strip():
return "Please enter a question."
return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)
def summarize_interface(transcript, summary_type):
"""Interface function to create summaries"""
return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)
# Create Gradio interface
def create_interface():
with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Soft()) as app:
gr.Markdown("""
# πŸŽ₯ Video Question Answering and Summarizer
Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.
**Features:**
- Extract and transcribe audio from video files
- Ask questions about the video content
- Generate different types of summaries
- Powered by NVIDIA NeMo Canary-Qwen-2.5B
""")
# Model loading section
with gr.Row():
gr.Markdown("## πŸš€ Step 1: Load Model")
with gr.Row():
load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary")
model_status = gr.Textbox(label="Model Status", interactive=False)
load_btn.click(load_model_interface, outputs=model_status)
# Video processing section
with gr.Row():
gr.Markdown("## πŸ“Ή Step 2: Upload and Process Video")
with gr.Row():
with gr.Column():
video_input = gr.Video(label="Upload Video File")
process_btn = gr.Button("Process Video", variant="primary")
with gr.Column():
process_status = gr.Textbox(label="Processing Status", interactive=False)
transcript_output = gr.Textbox(
label="Transcript",
lines=10,
max_lines=20,
interactive=False
)
process_btn.click(
process_video,
inputs=video_input,
outputs=[process_status, transcript_output]
)
# Question answering section
with gr.Row():
gr.Markdown("## ❓ Step 3: Ask Questions")
with gr.Row():
with gr.Column():
question_input = gr.Textbox(
label="Your Question",
placeholder="What is this video about?",
lines=2
)
ask_btn = gr.Button("Ask Question", variant="secondary")
with gr.Column():
answer_output = gr.Textbox(
label="Answer",
lines=5,
interactive=False
)
ask_btn.click(
answer_question_interface,
inputs=[question_input, transcript_output],
outputs=answer_output
)
# Summarization section
with gr.Row():
gr.Markdown("## πŸ“ Step 4: Generate Summary")
with gr.Row():
with gr.Column():
summary_type = gr.Dropdown(
choices=["general", "detailed", "bullet_points"],
value="general",
label="Summary Type"
)
summarize_btn = gr.Button("Generate Summary", variant="secondary")
with gr.Column():
summary_output = gr.Textbox(
label="Summary",
lines=8,
interactive=False
)
summarize_btn.click(
summarize_interface,
inputs=[transcript_output, summary_type],
outputs=summary_output
)
# Instructions and tips
with gr.Row():
gr.Markdown("""
## πŸ’‘ Tips:
1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
2. **Audio quality**: Better audio quality leads to more accurate transcriptions
3. **Processing time**: Larger videos take longer to process
4. **Questions**: Be specific with your questions for better answers
5. **Summaries**: Choose the summary type that best fits your needs
## ⚠️ Requirements:
- PyTorch 2.6+ for FSDP2 support
- CUDA-compatible GPU recommended for optimal performance
- Sufficient disk space for temporary audio files
""")
return app
# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch(share=True)