shukdevdattaEX committed
Commit 8667e77 · verified · 1 Parent(s): be29904

Delete app.py

Files changed (1): app.py (+0, -492)
app.py DELETED
@@ -1,492 +0,0 @@
import gradio as gr
import os
import tempfile
import subprocess
import librosa
import soundfile as sf
import torch
from pathlib import Path
import traceback
from typing import List, Dict, Tuple, Optional
import time

# Install required packages
def install_requirements():
    """Install required packages if not already installed"""
    try:
        import nemo
        print("NeMo already installed")
    except ImportError:
        print("Installing NeMo...")
        subprocess.run([
            "pip", "install",
            "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git"
        ], check=True)

    try:
        import moviepy
        print("MoviePy already installed")
    except ImportError:
        print("Installing MoviePy...")
        subprocess.run(["pip", "install", "moviepy"], check=True)

# Try to install requirements
try:
    install_requirements()
    from nemo.collections.speechlm2.models import SALM
    import moviepy.editor as mp
    DEPENDENCIES_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not install dependencies: {e}")
    DEPENDENCIES_AVAILABLE = False

class VideoQASummarizer:
    def __init__(self):
        self.model = None
        self.current_transcript = ""
        self.model_loaded = False

    def load_model(self):
        """Load the Canary-Qwen-2.5B model"""
        if not DEPENDENCIES_AVAILABLE:
            return "Error: Required dependencies not available. Please install manually."

        try:
            if self.model is None:
                print("Loading Canary-Qwen-2.5B model...")
                self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
                self.model_loaded = True
                return "Model loaded successfully!"
            return "Model already loaded."
        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def extract_audio_from_video(self, video_path: str) -> str:
        """Extract audio from video file"""
        try:
            # Create temporary audio file
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_audio_path = temp_audio.name
            temp_audio.close()

            # Load video and extract audio
            video = mp.VideoFileClip(video_path)
            audio = video.audio

            # Write audio to temporary file
            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

            # Clean up
            audio.close()
            video.close()

            return temp_audio_path
        except Exception as e:
            raise Exception(f"Error extracting audio: {str(e)}")

    def split_audio_by_duration(self, audio_path: str, max_duration: int = 30) -> List[str]:
        """Split long audio files into smaller chunks"""
        try:
            # Load audio to check duration
            audio, sr = librosa.load(audio_path, sr=16000)
            total_duration = len(audio) / sr

            if total_duration <= max_duration:
                return [audio_path]

            # Split audio into chunks
            chunk_paths = []
            chunk_samples = max_duration * sr

            for i in range(0, len(audio), chunk_samples):
                chunk = audio[i:i + chunk_samples]

                # Create temporary file for chunk
                temp_chunk = tempfile.NamedTemporaryFile(delete=False, suffix=f'_chunk_{i//chunk_samples}.wav')
                chunk_path = temp_chunk.name
                temp_chunk.close()

                # Save chunk
                sf.write(chunk_path, chunk, sr)
                chunk_paths.append(chunk_path)

            return chunk_paths
        except Exception as e:
            raise Exception(f"Error splitting audio: {str(e)}")

    def preprocess_audio(self, audio_path: str) -> str:
        """Preprocess audio for the model (ensure correct format)"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz if needed

            # Create new temporary file for processed audio
            temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_processed_path = temp_processed.name
            temp_processed.close()

            # Save processed audio
            sf.write(temp_processed_path, audio, 16000)

            return temp_processed_path
        except Exception as e:
            raise Exception(f"Error preprocessing audio: {str(e)}")

    def transcribe_audio_chunk(self, audio_path: str) -> str:
        """Transcribe a single audio chunk"""
        try:
            # Preprocess audio
            processed_audio_path = self.preprocess_audio(audio_path)

            # Transcribe using ASR mode with increased token limit
            answer_ids = self.model.generate(
                prompts=[
                    [{"role": "user", "content": f"Transcribe the following: {self.model.audio_locator_tag}", "audio": [processed_audio_path]}]
                ],
                max_new_tokens=4096,  # Increased from 512 to handle longer content
                temperature=0.1,      # Lower temperature for more consistent transcription
                do_sample=True,
            )

            transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())

            # Clean up temporary file
            os.unlink(processed_audio_path)

            return transcript.strip()
        except Exception as e:
            raise Exception(f"Error transcribing chunk: {str(e)}")

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Canary-Qwen-2.5B in ASR mode with chunking for long files"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            # Check audio duration and split if necessary
            audio, sr = librosa.load(audio_path, sr=16000)
            duration = len(audio) / sr
            print(f"Audio duration: {duration:.2f} seconds")

            if duration > 30:  # Split long audio files
                print("Long audio detected, splitting into chunks...")
                chunk_paths = self.split_audio_by_duration(audio_path, max_duration=30)

                full_transcript = ""
                for i, chunk_path in enumerate(chunk_paths):
                    print(f"Transcribing chunk {i+1}/{len(chunk_paths)}")
                    chunk_transcript = self.transcribe_audio_chunk(chunk_path)

                    # Clean up chunk transcript (remove model artifacts)
                    chunk_transcript = self.clean_transcript(chunk_transcript)

                    if chunk_transcript:
                        full_transcript += chunk_transcript + " "

                    # Clean up chunk file if we created it
                    if chunk_path != audio_path:
                        os.unlink(chunk_path)

                return full_transcript.strip()
            else:
                # Short audio, transcribe directly
                transcript = self.transcribe_audio_chunk(audio_path)
                return self.clean_transcript(transcript)

        except Exception as e:
            error_msg = f"Error during transcription: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def clean_transcript(self, transcript: str) -> str:
        """Clean up transcript by removing model artifacts and formatting issues"""
        try:
            # Remove common model artifacts
            artifacts_to_remove = [
                "Sure! Here's the transcription without the timestamps, written as a single paragraph:",
                "Here's the transcription:",
                "Transcription:",
                "<|im_start|>",
                "<|im_end|>",
                "<audio>",
                "</audio>",
            ]

            cleaned = transcript
            for artifact in artifacts_to_remove:
                cleaned = cleaned.replace(artifact, "")

            # Remove extra whitespace and normalize
            cleaned = " ".join(cleaned.split())

            # Remove any leading/trailing punctuation issues
            cleaned = cleaned.strip(" .,!?")

            return cleaned
        except Exception as e:
            print(f"Error cleaning transcript: {e}")
            return transcript

    def answer_question(self, question: str, transcript: str) -> str:
        """Answer questions about the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Use LLM mode to answer questions
            prompt = f"Based on the following transcript, please answer this question: {question}\n\nTranscript: {transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=4096,  # Increased for longer answers
                    temperature=0.3,
                    do_sample=True,
                )

            answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return answer.strip()
        except Exception as e:
            error_msg = f"Error answering question: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
        """Summarize the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Create different summary prompts based on type
            if summary_type == "bullet_points":
                prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
            elif summary_type == "detailed":
                prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
            else:  # general
                prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=4096,  # Increased for longer summaries
                    temperature=0.3,
                    do_sample=True,
                )

            summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return summary.strip()
        except Exception as e:
            error_msg = f"Error creating summary: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

# Initialize the model
qa_summarizer = VideoQASummarizer()

def load_model_interface():
    """Interface function to load the model"""
    return qa_summarizer.load_model()

def process_video(video_file, progress=gr.Progress()):
    """Process uploaded video and return transcript"""
    if video_file is None:
        return "Please upload a video file.", ""

    try:
        progress(0.1, desc="Extracting audio from video...")
        # Extract audio from video
        audio_path = qa_summarizer.extract_audio_from_video(video_file)

        progress(0.3, desc="Analyzing audio duration...")
        # Check audio duration for progress estimation
        audio, sr = librosa.load(audio_path, sr=16000)
        duration = len(audio) / sr

        progress(0.4, desc="Starting transcription...")
        # Transcribe audio
        transcript = qa_summarizer.transcribe_audio(audio_path)

        progress(0.9, desc="Finalizing transcript...")
        # Store transcript for later use
        qa_summarizer.current_transcript = transcript

        # Clean up temporary audio file
        if os.path.exists(audio_path):
            os.unlink(audio_path)

        progress(1.0, desc="Complete!")
        return f"Video processed successfully! (Duration: {duration:.1f}s)", transcript
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg, ""

def answer_question_interface(question, transcript):
    """Interface function to answer questions"""
    if not question.strip():
        return "Please enter a question."

    return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)

def summarize_interface(transcript, summary_type):
    """Interface function to create summaries"""
    return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)

# Create Gradio interface
def create_interface():

    css = """
    .load-btn {
        margin: auto;
    }
    """

    with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Ocean(), css=css) as app:
        gr.Markdown("""
        # 🎥 Video Question Answering and Summarizer

        Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.

        **Features:**
        - Extract and transcribe audio from video files (handles long videos with chunking)
        - Ask questions about the video content
        - Generate different types of summaries
        - Powered by NVIDIA NeMo Canary-Qwen-2.5B
        """)

        # Model loading section
        with gr.Row():
            gr.Markdown("## 🚀 Step 1: Load Model")

        with gr.Row():
            load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary", elem_classes=["load-btn"])
            model_status = gr.Textbox(label="Model Status", interactive=False)

        load_btn.click(load_model_interface, outputs=model_status)

        # Video processing section
        with gr.Row():
            gr.Markdown("## 📹 Step 2: Upload and Process Video")

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video File")
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column():
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=15,
                    max_lines=25,
                    interactive=False,
                    show_copy_button=True
                )

        process_btn.click(
            process_video,
            inputs=video_input,
            outputs=[process_status, transcript_output],
            show_progress=True
        )

        # Question answering section
        with gr.Row():
            gr.Markdown("## ❓ Step 3: Ask Questions")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is this video about?",
                    lines=2
                )
                ask_btn = gr.Button("Ask Question", variant="secondary")

            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=6,
                    interactive=False,
                    show_copy_button=True
                )

        ask_btn.click(
            answer_question_interface,
            inputs=[question_input, transcript_output],
            outputs=answer_output
        )

        # Summarization section
        with gr.Row():
            gr.Markdown("## 📝 Step 4: Generate Summary")

        with gr.Row():
            with gr.Column():
                summary_type = gr.Dropdown(
                    choices=["general", "detailed", "bullet_points"],
                    value="general",
                    label="Summary Type"
                )
                summarize_btn = gr.Button("Generate Summary", variant="secondary")

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=10,
                    interactive=False,
                    show_copy_button=True
                )

        summarize_btn.click(
            summarize_interface,
            inputs=[transcript_output, summary_type],
            outputs=summary_output
        )

        # Instructions and tips
        with gr.Row():
            gr.Markdown("""
            ## 💡 Tips & Improvements:

            1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
            2. **Audio quality**: Better audio quality leads to more accurate transcriptions
            3. **Long videos**: The app now automatically splits long audio files into chunks for complete transcription
            4. **Processing time**: Longer videos are processed in chunks, which may take more time but ensures completeness
            5. **Questions**: Be specific with your questions for better answers
            6. **Summaries**: Choose the summary type that best fits your needs

            ## 🔧 Recent Fixes:
            - **Increased token limits** for complete output (max_new_tokens=4096 for transcription, answers, and summaries)
            - **Audio chunking** for videos longer than 30 seconds to prevent cutoffs
            - **Improved transcript cleaning** to remove model artifacts
            - **Better progress tracking** during video processing
            - **Copy buttons** for easy text copying

            ## ⚠️ Requirements:
            - PyTorch 2.6+ for FSDP2 support
            - CUDA-compatible GPU recommended for optimal performance
            - Sufficient disk space for temporary audio files
            """)

    return app

# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True
    )
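
For reference, the core transcription call that the deleted app.py wrapped can be exercised on its own. This is a minimal sketch, assuming the same NeMo speechlm2 SALM API used in the code above; "sample.wav" is a hypothetical 16 kHz mono input file, and the small max_new_tokens value is a choice for a short clip (app.py used 4096).

# Minimal sketch of the ASR call at the heart of the deleted app.py.
from nemo.collections.speechlm2.models import SALM

model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
answer_ids = model.generate(
    prompts=[
        [{"role": "user",
          "content": f"Transcribe the following: {model.audio_locator_tag}",
          "audio": ["sample.wav"]}]  # hypothetical 16 kHz mono WAV
    ],
    max_new_tokens=128,  # small value for a short clip; app.py used 4096
)
print(model.tokenizer.ids_to_text(answer_ids[0].cpu()))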