shukdevdattaEX committed
Commit 8667e77 · verified · 1 Parent(s): be29904

Delete app.py

Files changed (1): app.py (+0, -492)
app.py DELETED
@@ -1,492 +0,0 @@
import gradio as gr
import os
import tempfile
import subprocess
import librosa
import soundfile as sf
import torch
from pathlib import Path
import traceback
from typing import List, Dict, Tuple, Optional
import time

# Install required packages
def install_requirements():
    """Install required packages if not already installed"""
    try:
        import nemo
        print("NeMo already installed")
    except ImportError:
        print("Installing NeMo...")
        subprocess.run([
            "pip", "install",
            "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git"
        ], check=True)

    try:
        import moviepy
        print("MoviePy already installed")
    except ImportError:
        print("Installing MoviePy...")
        subprocess.run(["pip", "install", "moviepy"], check=True)

# Try to install requirements
try:
    install_requirements()
    from nemo.collections.speechlm2.models import SALM
    import moviepy.editor as mp
    DEPENDENCIES_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not install dependencies: {e}")
    DEPENDENCIES_AVAILABLE = False

class VideoQASummarizer:
    def __init__(self):
        self.model = None
        self.current_transcript = ""
        self.model_loaded = False

    def load_model(self):
        """Load the Canary-Qwen-2.5B model"""
        if not DEPENDENCIES_AVAILABLE:
            return "Error: Required dependencies not available. Please install manually."

        try:
            if self.model is None:
                print("Loading Canary-Qwen-2.5B model...")
                self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
                self.model_loaded = True
                return "Model loaded successfully!"
            return "Model already loaded."
        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def extract_audio_from_video(self, video_path: str) -> str:
        """Extract audio from video file"""
        try:
            # Create temporary audio file
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_audio_path = temp_audio.name
            temp_audio.close()

            # Load video and extract audio
            video = mp.VideoFileClip(video_path)
            audio = video.audio

            # Write audio to temporary file
            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

            # Clean up
            audio.close()
            video.close()

            return temp_audio_path
        except Exception as e:
            raise Exception(f"Error extracting audio: {str(e)}")

    def split_audio_by_duration(self, audio_path: str, max_duration: int = 30) -> List[str]:
        """Split long audio files into smaller chunks"""
        try:
            # Load audio to check duration
            audio, sr = librosa.load(audio_path, sr=16000)
            total_duration = len(audio) / sr

            if total_duration <= max_duration:
                return [audio_path]

            # Split audio into chunks
            chunk_paths = []
            chunk_samples = max_duration * sr

            for i in range(0, len(audio), chunk_samples):
                chunk = audio[i:i + chunk_samples]

                # Create temporary file for chunk
                temp_chunk = tempfile.NamedTemporaryFile(delete=False, suffix=f'_chunk_{i//chunk_samples}.wav')
                chunk_path = temp_chunk.name
                temp_chunk.close()

                # Save chunk
                sf.write(chunk_path, chunk, sr)
                chunk_paths.append(chunk_path)

            return chunk_paths
        except Exception as e:
            raise Exception(f"Error splitting audio: {str(e)}")

    def preprocess_audio(self, audio_path: str) -> str:
        """Preprocess audio for the model (ensure correct format)"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz if needed

            # Create new temporary file for processed audio
            temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_processed_path = temp_processed.name
            temp_processed.close()

            # Save processed audio
            sf.write(temp_processed_path, audio, 16000)

            return temp_processed_path
        except Exception as e:
            raise Exception(f"Error preprocessing audio: {str(e)}")

    def transcribe_audio_chunk(self, audio_path: str) -> str:
        """Transcribe a single audio chunk"""
        try:
            # Preprocess audio
            processed_audio_path = self.preprocess_audio(audio_path)

            # Transcribe using ASR mode with increased token limit
            answer_ids = self.model.generate(
                prompts=[
                    [{"role": "user", "content": f"Transcribe the following: {self.model.audio_locator_tag}", "audio": [processed_audio_path]}]
                ],
                max_new_tokens=4096,  # Increased from 512 to handle longer content
                temperature=0.1,      # Lower temperature for more consistent transcription
                do_sample=True,
            )

            transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())

            # Clean up temporary file
            os.unlink(processed_audio_path)

            return transcript.strip()
        except Exception as e:
            raise Exception(f"Error transcribing chunk: {str(e)}")

    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Canary-Qwen-2.5B in ASR mode with chunking for long files"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            # Check audio duration and split if necessary
            audio, sr = librosa.load(audio_path, sr=16000)
            duration = len(audio) / sr
            print(f"Audio duration: {duration:.2f} seconds")

            if duration > 30:  # Split long audio files
                print("Long audio detected, splitting into chunks...")
                chunk_paths = self.split_audio_by_duration(audio_path, max_duration=30)

                full_transcript = ""
                for i, chunk_path in enumerate(chunk_paths):
                    print(f"Transcribing chunk {i+1}/{len(chunk_paths)}")
                    chunk_transcript = self.transcribe_audio_chunk(chunk_path)

                    # Clean up chunk transcript (remove model artifacts)
                    chunk_transcript = self.clean_transcript(chunk_transcript)

                    if chunk_transcript:
                        full_transcript += chunk_transcript + " "

                    # Clean up chunk file if we created it
                    if chunk_path != audio_path:
                        os.unlink(chunk_path)

                return full_transcript.strip()
            else:
                # Short audio, transcribe directly
                transcript = self.transcribe_audio_chunk(audio_path)
                return self.clean_transcript(transcript)

        except Exception as e:
            error_msg = f"Error during transcription: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def clean_transcript(self, transcript: str) -> str:
        """Clean up transcript by removing model artifacts and formatting issues"""
        try:
            # Remove common model artifacts
            artifacts_to_remove = [
                "Sure! Here's the transcription without the timestamps, written as a single paragraph:",
                "Here's the transcription:",
                "Transcription:",
                "<|im_start|>",
                "<|im_end|>",
                "<audio>",
                "</audio>",
            ]

            cleaned = transcript
            for artifact in artifacts_to_remove:
                cleaned = cleaned.replace(artifact, "")

            # Remove extra whitespace and normalize
            cleaned = " ".join(cleaned.split())

            # Remove any leading/trailing punctuation issues
            cleaned = cleaned.strip(" .,!?")

            return cleaned
        except Exception as e:
            print(f"Error cleaning transcript: {e}")
            return transcript

    def answer_question(self, question: str, transcript: str) -> str:
        """Answer questions about the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Use LLM mode to answer questions
            prompt = f"Based on the following transcript, please answer this question: {question}\n\nTranscript: {transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=4096,  # Increased for longer answers
                    temperature=0.3,
                    do_sample=True,
                )

            answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return answer.strip()
        except Exception as e:
            error_msg = f"Error answering question: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

    def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
        """Summarize the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Create different summary prompts based on type
            if summary_type == "bullet_points":
                prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
            elif summary_type == "detailed":
                prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
            else:  # general
                prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=4096,  # Increased for longer summaries
                    temperature=0.3,
                    do_sample=True,
                )

            summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return summary.strip()
        except Exception as e:
            error_msg = f"Error creating summary: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

# Initialize the model
qa_summarizer = VideoQASummarizer()

def load_model_interface():
    """Interface function to load the model"""
    return qa_summarizer.load_model()

def process_video(video_file, progress=gr.Progress()):
    """Process uploaded video and return transcript"""
    if video_file is None:
        return "Please upload a video file.", ""

    try:
        progress(0.1, desc="Extracting audio from video...")
        # Extract audio from video
        audio_path = qa_summarizer.extract_audio_from_video(video_file)

        progress(0.3, desc="Analyzing audio duration...")
        # Check audio duration for progress estimation
        audio, sr = librosa.load(audio_path, sr=16000)
        duration = len(audio) / sr

        progress(0.4, desc="Starting transcription...")
        # Transcribe audio
        transcript = qa_summarizer.transcribe_audio(audio_path)

        progress(0.9, desc="Finalizing transcript...")
        # Store transcript for later use
        qa_summarizer.current_transcript = transcript

        # Clean up temporary audio file
        if os.path.exists(audio_path):
            os.unlink(audio_path)

        progress(1.0, desc="Complete!")
        return f"Video processed successfully! (Duration: {duration:.1f}s)", transcript
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg, ""

def answer_question_interface(question, transcript):
    """Interface function to answer questions"""
    if not question.strip():
        return "Please enter a question."

    return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)

def summarize_interface(transcript, summary_type):
    """Interface function to create summaries"""
    return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)

# Create Gradio interface
def create_interface():

    css = """
    .load-btn {
        margin: auto;
    }
    """

    with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Ocean(), css=css) as app:
        gr.Markdown("""
        # 🎥 Video Question Answering and Summarizer

        Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.

        **Features:**
        - Extract and transcribe audio from video files (handles long videos with chunking)
        - Ask questions about the video content
        - Generate different types of summaries
        - Powered by NVIDIA NeMo Canary-Qwen-2.5B
        """)

        # Model loading section
        with gr.Row():
            gr.Markdown("## 🚀 Step 1: Load Model")

        with gr.Row():
            load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary", elem_classes=["load-btn"])
            model_status = gr.Textbox(label="Model Status", interactive=False)

        load_btn.click(load_model_interface, outputs=model_status)

        # Video processing section
        with gr.Row():
            gr.Markdown("## 📹 Step 2: Upload and Process Video")

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video File")
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column():
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=15,
                    max_lines=25,
                    interactive=False,
                    show_copy_button=True
                )

        process_btn.click(
            process_video,
            inputs=video_input,
            outputs=[process_status, transcript_output],
            show_progress=True
        )

        # Question answering section
        with gr.Row():
            gr.Markdown("## ❓ Step 3: Ask Questions")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is this video about?",
                    lines=2
                )
                ask_btn = gr.Button("Ask Question", variant="secondary")

            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=6,
                    interactive=False,
                    show_copy_button=True
                )

        ask_btn.click(
            answer_question_interface,
            inputs=[question_input, transcript_output],
            outputs=answer_output
        )

        # Summarization section
        with gr.Row():
            gr.Markdown("## 📝 Step 4: Generate Summary")

        with gr.Row():
            with gr.Column():
                summary_type = gr.Dropdown(
                    choices=["general", "detailed", "bullet_points"],
                    value="general",
                    label="Summary Type"
                )
                summarize_btn = gr.Button("Generate Summary", variant="secondary")

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=10,
                    interactive=False,
                    show_copy_button=True
                )

        summarize_btn.click(
            summarize_interface,
            inputs=[transcript_output, summary_type],
            outputs=summary_output
        )

        # Instructions and tips
        with gr.Row():
            gr.Markdown("""
            ## 💡 Tips & Improvements:

            1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
            2. **Audio quality**: Better audio quality leads to more accurate transcriptions
            3. **Long videos**: The app now automatically splits long audio files into chunks for complete transcription
            4. **Processing time**: Longer videos are processed in chunks, which may take more time but ensures completeness
            5. **Questions**: Be specific with your questions for better answers
            6. **Summaries**: Choose the summary type that best fits your needs

            ## 🔧 Recent Fixes:
            - **Increased token limits** for complete output (max_new_tokens=4096 for transcription, answers, and summaries)
            - **Audio chunking** for videos longer than 30 seconds to prevent cutoffs
            - **Improved transcript cleaning** to remove model artifacts
            - **Better progress tracking** during video processing
            - **Copy buttons** for easy text copying

            ## ⚠️ Requirements:
            - PyTorch 2.6+ for FSDP2 support
            - CUDA-compatible GPU recommended for optimal performance
            - Sufficient disk space for temporary audio files
            """)

    return app

# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True
    )
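
For reference, the core transcription call that the deleted app.py wrapped can be exercised on its own. This is a minimal sketch, assuming the same NeMo speechlm2 SALM API used in the code above; "sample.wav" is a hypothetical 16 kHz mono input file, and the small max_new_tokens value is a choice for a short clip (app.py used 4096).

# Minimal sketch of the ASR call at the heart of the deleted app.py.
from nemo.collections.speechlm2.models import SALM

model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
answer_ids = model.generate(
    prompts=[
        [{"role": "user",
          "content": f"Transcribe the following: {model.audio_locator_tag}",
          "audio": ["sample.wav"]}]  # hypothetical 16 kHz mono WAV
    ],
    max_new_tokens=128,  # small value for a short clip; app.py used 4096
)
print(model.tokenizer.ids_to_text(answer_ids[0].cpu()))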