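"""Video Q&A and Summarizer.

A Gradio app that extracts audio from an uploaded video, transcribes it with
NVIDIA's Canary-Qwen-2.5B model, and then answers questions about or
summarizes the resulting transcript.
"""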
import os
import subprocess
import sys
import tempfile
import traceback

import gradio as gr
import librosa
import soundfile as sf


def install_requirements():
    """Install required packages if they are not already installed."""
    try:
        import nemo  # noqa: F401
        print("NeMo already installed")
    except ImportError:
        print("Installing NeMo...")
        # Use the running interpreter's pip so packages land in the right environment.
        subprocess.run([
            sys.executable, "-m", "pip", "install",
            "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git",
        ], check=True)

    try:
        import moviepy  # noqa: F401
        print("MoviePy already installed")
    except ImportError:
        print("Installing MoviePy...")
        # Pin to 1.x: the `moviepy.editor` module imported below was removed in MoviePy 2.0.
        subprocess.run([sys.executable, "-m", "pip", "install", "moviepy<2"], check=True)
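
# Note: installing NeMo from the git main branch pulls in heavy dependencies
# and can take several minutes on first run.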


try:
    install_requirements()
    from nemo.collections.speechlm2.models import SALM
    import moviepy.editor as mp
    DEPENDENCIES_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not install dependencies: {e}")
    DEPENDENCIES_AVAILABLE = False


class VideoQASummarizer:
    def __init__(self):
        self.model = None
        self.current_transcript = ""
        self.model_loaded = False

    def load_model(self):
        """Load the Canary-Qwen-2.5B model."""
        if not DEPENDENCIES_AVAILABLE:
            return "Error: Required dependencies not available. Please install them manually."

        try:
            if self.model is None:
                print("Loading Canary-Qwen-2.5B model...")
                self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
                self.model_loaded = True
                return "Model loaded successfully!"
            return "Model already loaded."
        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def extract_audio_from_video(self, video_path: str) -> str:
        """Extract the audio track from a video file into a temporary WAV file."""
        try:
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_audio_path = temp_audio.name
            temp_audio.close()

            video = mp.VideoFileClip(video_path)
            audio = video.audio
            if audio is None:
                video.close()
                raise ValueError("The video file contains no audio track.")

            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

            audio.close()
            video.close()

            return temp_audio_path
        except Exception as e:
            raise RuntimeError(f"Error extracting audio: {e}") from e
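
    # Note: MoviePy decodes media through FFmpeg, which its imageio-ffmpeg
    # dependency normally bundles, so a separate FFmpeg install is usually
    # not required.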

    def preprocess_audio(self, audio_path: str) -> str:
        """Preprocess audio into the 16 kHz mono WAV format the model expects."""
        try:
            # librosa.load resamples to 16 kHz and downmixes to mono by default.
            audio, sr = librosa.load(audio_path, sr=16000)

            temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_processed_path = temp_processed.name
            temp_processed.close()

            sf.write(temp_processed_path, audio, 16000)

            return temp_processed_path
        except Exception as e:
            raise RuntimeError(f"Error preprocessing audio: {e}") from e

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Canary-Qwen-2.5B in ASR mode."""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            processed_audio_path = self.preprocess_audio(audio_path)

            # ASR mode: the audio locator tag in the prompt marks where the
            # audio referenced in the "audio" field is attached.
            answer_ids = self.model.generate(
                prompts=[[{
                    "role": "user",
                    "content": f"Transcribe the following: {self.model.audio_locator_tag}",
                    "audio": [processed_audio_path],
                }]],
                max_new_tokens=512,
            )
            transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())

            # Clean up the temporary resampled file.
            os.unlink(processed_audio_path)

            return transcript.strip()
        except Exception as e:
            error_msg = f"Error during transcription: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg
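
    # Canary-Qwen-2.5B is a SALM (speech-augmented language model). With the
    # speech adapter active, it transcribes audio referenced by the audio
    # locator tag (ASR mode, above); with the adapter disabled via
    # llm.disable_adapter(), it acts as a text-only LLM (used below for Q&A
    # and summarization).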

    def answer_question(self, question: str, transcript: str) -> str:
        """Answer a question about the transcript using the model's LLM mode."""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            prompt = (
                f"Based on the following transcript, please answer this question: "
                f"{question}\n\nTranscript: {transcript}"
            )

            # disable_adapter() bypasses the speech adapter so the underlying
            # LLM runs in text-only mode.
            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=512,
                )

            answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return answer.strip()
        except Exception as e:
            error_msg = f"Error answering question: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
        """Summarize the transcript using the model's LLM mode."""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            if summary_type == "bullet_points":
                prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
            elif summary_type == "detailed":
                prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
            else:
                prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=1024,
                )

            summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return summary.strip()
        except Exception as e:
            error_msg = f"Error creating summary: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg
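

# Programmatic usage sketch (no UI; "talk.mp4" is a hypothetical local file):
#
#     summarizer = VideoQASummarizer()
#     summarizer.load_model()
#     audio_path = summarizer.extract_audio_from_video("talk.mp4")
#     transcript = summarizer.transcribe_audio(audio_path)
#     print(summarizer.summarize_transcript(transcript, "bullet_points"))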


qa_summarizer = VideoQASummarizer()


def load_model_interface():
    """Interface function to load the model."""
    return qa_summarizer.load_model()


def process_video(video_file):
    """Process an uploaded video and return its transcript."""
    if video_file is None:
        return "Please upload a video file.", ""

    try:
        print("Extracting audio from video...")
        audio_path = qa_summarizer.extract_audio_from_video(video_file)

        print("Transcribing audio...")
        transcript = qa_summarizer.transcribe_audio(audio_path)

        qa_summarizer.current_transcript = transcript

        # Clean up the temporary audio file.
        if os.path.exists(audio_path):
            os.unlink(audio_path)

        return "Video processed successfully!", transcript
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg, ""


def answer_question_interface(question, transcript):
    """Interface function to answer questions."""
    if not question.strip():
        return "Please enter a question."

    return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)


def summarize_interface(transcript, summary_type):
    """Interface function to create summaries."""
    return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)


def create_interface():
    with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🎥 Video Question Answering and Summarizer

        Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.

        **Features:**
        - Extract and transcribe audio from video files
        - Ask questions about the video content
        - Generate different types of summaries
        - Powered by NVIDIA NeMo Canary-Qwen-2.5B
        """)

        with gr.Row():
            gr.Markdown("## 🚀 Step 1: Load Model")

        with gr.Row():
            load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary")
            model_status = gr.Textbox(label="Model Status", interactive=False)

        load_btn.click(load_model_interface, outputs=model_status)

        with gr.Row():
            gr.Markdown("## 📹 Step 2: Upload and Process Video")

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video File")
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column():
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=10,
                    max_lines=20,
                    interactive=False
                )

        process_btn.click(
            process_video,
            inputs=video_input,
            outputs=[process_status, transcript_output]
        )

        with gr.Row():
            gr.Markdown("## ❓ Step 3: Ask Questions")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is this video about?",
                    lines=2
                )
                ask_btn = gr.Button("Ask Question", variant="secondary")

            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=5,
                    interactive=False
                )

        ask_btn.click(
            answer_question_interface,
            inputs=[question_input, transcript_output],
            outputs=answer_output
        )

        with gr.Row():
            gr.Markdown("## 📝 Step 4: Generate Summary")

        with gr.Row():
            with gr.Column():
                summary_type = gr.Dropdown(
                    choices=["general", "detailed", "bullet_points"],
                    value="general",
                    label="Summary Type"
                )
                summarize_btn = gr.Button("Generate Summary", variant="secondary")

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=8,
                    interactive=False
                )

        summarize_btn.click(
            summarize_interface,
            inputs=[transcript_output, summary_type],
            outputs=summary_output
        )

        with gr.Row():
            gr.Markdown("""
            ## 💡 Tips:

            1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
            2. **Audio quality**: Better audio quality leads to more accurate transcriptions
            3. **Processing time**: Larger videos take longer to process
            4. **Questions**: Be specific with your questions for better answers
            5. **Summaries**: Choose the summary type that best fits your needs

            ## ⚠️ Requirements:
            - PyTorch 2.6+ for FSDP2 support
            - CUDA-compatible GPU recommended for optimal performance
            - Sufficient disk space for temporary audio files
            """)

    return app


if __name__ == "__main__":
    app = create_interface()
    # share=True also serves the app through a temporary public gradio.live link.
    app.launch(share=True)