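"""Video Q&A and Summarizer.

A Gradio app that extracts audio from an uploaded video, transcribes it with
NVIDIA's Canary-Qwen-2.5B model, and then answers questions about or
summarizes the resulting transcript.
"""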
import os
import subprocess
import sys
import tempfile
import traceback

import gradio as gr
import librosa
import soundfile as sf


def install_requirements():
    """Install required packages if they are not already installed."""
    try:
        import nemo  # noqa: F401
        print("NeMo already installed")
    except ImportError:
        print("Installing NeMo...")
        # Use the running interpreter's pip so packages land in the right environment.
        subprocess.run([
            sys.executable, "-m", "pip", "install",
            "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git",
        ], check=True)

    try:
        import moviepy  # noqa: F401
        print("MoviePy already installed")
    except ImportError:
        print("Installing MoviePy...")
        # Pin to 1.x: the `moviepy.editor` module imported below was removed in MoviePy 2.0.
        subprocess.run([sys.executable, "-m", "pip", "install", "moviepy<2"], check=True)
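
# Note: installing NeMo from the git main branch pulls in heavy dependencies
# and can take several minutes on first run.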


try:
    install_requirements()
    from nemo.collections.speechlm2.models import SALM
    import moviepy.editor as mp
    DEPENDENCIES_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not install dependencies: {e}")
    DEPENDENCIES_AVAILABLE = False


class VideoQASummarizer:
    def __init__(self):
        self.model = None
        self.current_transcript = ""
        self.model_loaded = False

    def load_model(self):
        """Load the Canary-Qwen-2.5B model."""
        if not DEPENDENCIES_AVAILABLE:
            return "Error: Required dependencies not available. Please install them manually."

        try:
            if self.model is None:
                print("Loading Canary-Qwen-2.5B model...")
                self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
                self.model_loaded = True
                return "Model loaded successfully!"
            return "Model already loaded."
        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def extract_audio_from_video(self, video_path: str) -> str:
        """Extract the audio track from a video file into a temporary WAV file."""
        try:
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_audio_path = temp_audio.name
            temp_audio.close()

            video = mp.VideoFileClip(video_path)
            audio = video.audio
            if audio is None:
                video.close()
                raise ValueError("The video file contains no audio track.")

            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

            audio.close()
            video.close()

            return temp_audio_path
        except Exception as e:
            raise RuntimeError(f"Error extracting audio: {e}") from e
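
    # Note: MoviePy decodes media through FFmpeg, which its imageio-ffmpeg
    # dependency normally bundles, so a separate FFmpeg install is usually
    # not required.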

    def preprocess_audio(self, audio_path: str) -> str:
        """Preprocess audio into the 16 kHz mono WAV format the model expects."""
        try:
            # librosa.load resamples to 16 kHz and downmixes to mono by default.
            audio, sr = librosa.load(audio_path, sr=16000)

            temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_processed_path = temp_processed.name
            temp_processed.close()

            sf.write(temp_processed_path, audio, 16000)

            return temp_processed_path
        except Exception as e:
            raise RuntimeError(f"Error preprocessing audio: {e}") from e

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Canary-Qwen-2.5B in ASR mode."""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            processed_audio_path = self.preprocess_audio(audio_path)

            # ASR mode: the audio locator tag in the prompt marks where the
            # audio referenced in the "audio" field is attached.
            answer_ids = self.model.generate(
                prompts=[[{
                    "role": "user",
                    "content": f"Transcribe the following: {self.model.audio_locator_tag}",
                    "audio": [processed_audio_path],
                }]],
                max_new_tokens=512,
            )
            transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())

            # Clean up the temporary resampled file.
            os.unlink(processed_audio_path)

            return transcript.strip()
        except Exception as e:
            error_msg = f"Error during transcription: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg
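
    # Canary-Qwen-2.5B is a SALM (speech-augmented language model). With the
    # speech adapter active, it transcribes audio referenced by the audio
    # locator tag (ASR mode, above); with the adapter disabled via
    # llm.disable_adapter(), it acts as a text-only LLM (used below for Q&A
    # and summarization).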

    def answer_question(self, question: str, transcript: str) -> str:
        """Answer a question about the transcript using the model's LLM mode."""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            prompt = (
                f"Based on the following transcript, please answer this question: "
                f"{question}\n\nTranscript: {transcript}"
            )

            # disable_adapter() bypasses the speech adapter so the underlying
            # LLM runs in text-only mode.
            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=512,
                )

            answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return answer.strip()
        except Exception as e:
            error_msg = f"Error answering question: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
        """Summarize the transcript using the model's LLM mode."""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            if summary_type == "bullet_points":
                prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
            elif summary_type == "detailed":
                prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
            else:
                prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=1024,
                )

            summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return summary.strip()
        except Exception as e:
            error_msg = f"Error creating summary: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg
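

# Programmatic usage sketch (no UI; "talk.mp4" is a hypothetical local file):
#
#     summarizer = VideoQASummarizer()
#     summarizer.load_model()
#     audio_path = summarizer.extract_audio_from_video("talk.mp4")
#     transcript = summarizer.transcribe_audio(audio_path)
#     print(summarizer.summarize_transcript(transcript, "bullet_points"))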


qa_summarizer = VideoQASummarizer()


def load_model_interface():
    """Interface function to load the model."""
    return qa_summarizer.load_model()


def process_video(video_file):
    """Process an uploaded video and return its transcript."""
    if video_file is None:
        return "Please upload a video file.", ""

    try:
        print("Extracting audio from video...")
        audio_path = qa_summarizer.extract_audio_from_video(video_file)

        print("Transcribing audio...")
        transcript = qa_summarizer.transcribe_audio(audio_path)

        qa_summarizer.current_transcript = transcript

        # Clean up the temporary audio file.
        if os.path.exists(audio_path):
            os.unlink(audio_path)

        return "Video processed successfully!", transcript
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg, ""


def answer_question_interface(question, transcript):
    """Interface function to answer questions."""
    if not question.strip():
        return "Please enter a question."

    return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)


def summarize_interface(transcript, summary_type):
    """Interface function to create summaries."""
    return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)


def create_interface():
    with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🎥 Video Question Answering and Summarizer

        Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.

        **Features:**
        - Extract and transcribe audio from video files
        - Ask questions about the video content
        - Generate different types of summaries
        - Powered by NVIDIA NeMo Canary-Qwen-2.5B
        """)

        with gr.Row():
            gr.Markdown("## 🚀 Step 1: Load Model")

        with gr.Row():
            load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary")
            model_status = gr.Textbox(label="Model Status", interactive=False)

        load_btn.click(load_model_interface, outputs=model_status)

        with gr.Row():
            gr.Markdown("## 📹 Step 2: Upload and Process Video")

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video File")
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column():
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=10,
                    max_lines=20,
                    interactive=False
                )

        process_btn.click(
            process_video,
            inputs=video_input,
            outputs=[process_status, transcript_output]
        )

        with gr.Row():
            gr.Markdown("## ❓ Step 3: Ask Questions")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is this video about?",
                    lines=2
                )
                ask_btn = gr.Button("Ask Question", variant="secondary")

            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=5,
                    interactive=False
                )

        ask_btn.click(
            answer_question_interface,
            inputs=[question_input, transcript_output],
            outputs=answer_output
        )

        with gr.Row():
            gr.Markdown("## 📝 Step 4: Generate Summary")

        with gr.Row():
            with gr.Column():
                summary_type = gr.Dropdown(
                    choices=["general", "detailed", "bullet_points"],
                    value="general",
                    label="Summary Type"
                )
                summarize_btn = gr.Button("Generate Summary", variant="secondary")

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=8,
                    interactive=False
                )

        summarize_btn.click(
            summarize_interface,
            inputs=[transcript_output, summary_type],
            outputs=summary_output
        )

        with gr.Row():
            gr.Markdown("""
            ## 💡 Tips:

            1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
            2. **Audio quality**: Better audio quality leads to more accurate transcriptions
            3. **Processing time**: Larger videos take longer to process
            4. **Questions**: Be specific with your questions for better answers
            5. **Summaries**: Choose the summary type that best fits your needs

            ## ⚠️ Requirements:
            - PyTorch 2.6+ for FSDP2 support
            - CUDA-compatible GPU recommended for optimal performance
            - Sufficient disk space for temporary audio files
            """)

    return app


if __name__ == "__main__":
    app = create_interface()
    # share=True also serves the app through a temporary public gradio.live link.
    app.launch(share=True)