capradeepgujaran committed
Commit e25cab4 · verified · 1 Parent(s): ff87c41

Update app.py

Files changed (1): app.py (+192, −171)
app.py CHANGED
@@ -1,6 +1,11 @@
  import cv2
  import numpy as np
- from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
  import torch
  from PIL import Image
  import faiss
@@ -11,233 +16,247 @@ import tempfile
  import os
  import shutil
  from tqdm import tqdm
- import math

- class VideoRAGTool:
-     def __init__(self, clip_model_name: str = "openai/clip-vit-base-patch32",
-                  blip_model_name: str = "Salesforce/blip-image-captioning-base"):
-         """Initialize with performance optimizations."""
-         # Setup logger first to avoid the attribute error
          self.logger = self.setup_logger()
-
-         self.logger.info("Initializing VideoRAGTool...")
          self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          self.logger.info(f"Using device: {self.device}")
-
-         # Initialize models with optimization flags
-         self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(self.device)
-         self.clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
-         self.blip_processor = BlipProcessor.from_pretrained(blip_model_name)
-         self.blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(self.device)
-
-         # Enable eval mode for inference
-         self.clip_model.eval()
-         self.blip_model.eval()
-
-         # Batch processing settings
-         self.batch_size = 4  # Reduced batch size for better memory management
-
          self.frame_index = None
          self.frame_data = []

      def setup_logger(self) -> logging.Logger:
-         """Set up logging configuration."""
-         logger = logging.getLogger('VideoRAGTool')
-
-         # Clear any existing handlers
          if logger.handlers:
              logger.handlers.clear()
-
          logger.setLevel(logging.INFO)
          handler = logging.StreamHandler()
-         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
          handler.setFormatter(formatter)
          logger.addHandler(handler)
          return logger

      @torch.no_grad()
-     def generate_caption(self, image: Image.Image) -> str:
-         """Optimized caption generation."""
          try:
-             inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
-             out = self.blip_model.generate(**inputs, max_length=30, num_beams=2)
-             caption = self.blip_processor.decode(out[0], skip_special_tokens=True)
-             return caption
          except Exception as e:
-             self.logger.error(f"Error generating caption: {str(e)}")
-             return "Caption generation failed"

-     def get_video_info(self, video_path: str) -> Tuple[int, float]:
-         """Get video frame count and FPS."""
          cap = cv2.VideoCapture(video_path)
          total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
          fps = cap.get(cv2.CAP_PROP_FPS)
          cap.release()
-         return total_frames, fps
-
-     def preprocess_frame(self, frame: np.ndarray, target_size: Tuple[int, int] = (224, 224)) -> Image.Image:
-         """Preprocess frame with resizing for efficiency."""
-         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-         image = Image.fromarray(frame_rgb)
-         return image.resize(target_size, Image.LANCZOS)

      @torch.no_grad()
-     def process_batch(self, frames: List[Image.Image]) -> Tuple[np.ndarray, List[str]]:
-         """Process a batch of frames efficiently."""
-         try:
-             # CLIP processing
-             clip_inputs = self.clip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
-             image_features = self.clip_model.get_image_features(**clip_inputs)
-
-             # BLIP processing
-             captions = []
-             for frame in frames:
-                 caption = self.generate_caption(frame)
-                 captions.append(caption)
-
-             return image_features.cpu().numpy(), captions
-         except Exception as e:
-             self.logger.error(f"Error processing batch: {str(e)}")
-             raise
-
-     def process_video(self, video_path: str, frame_interval: int = 30) -> None:
-         """Optimized video processing with batching and progress tracking."""
          self.logger.info(f"Processing video: {video_path}")

          try:
-             total_frames, fps = self.get_video_info(video_path)
-             cap = cv2.VideoCapture(video_path)
-
-             # Calculate total batches for progress bar
-             frames_to_process = total_frames // frame_interval
-             total_batches = math.ceil(frames_to_process / self.batch_size)
-
-             current_batch = []
-             features_list = []
-             frame_count = 0

-             with tqdm(total=frames_to_process, desc="Processing frames") as pbar:
-                 while cap.isOpened():
-                     ret, frame = cap.read()
-                     if not ret:
-                         break

-                     if frame_count % frame_interval == 0:
-                         # Preprocess frame
-                         processed_frame = self.preprocess_frame(frame)
-                         current_batch.append(processed_frame)
-
-                         # Process batch when it reaches batch_size
-                         if len(current_batch) == self.batch_size:
-                             batch_features, batch_captions = self.process_batch(current_batch)
-
-                             # Store results
-                             for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
-                                 batch_frame_number = frame_count - (self.batch_size - i - 1) * frame_interval
-                                 self.frame_data.append({
-                                     'frame_number': batch_frame_number,
-                                     'timestamp': batch_frame_number / fps,
-                                     'caption': caption
-                                 })
-                                 features_list.append(features)
-
-                             current_batch = []
-                             pbar.update(self.batch_size)

-                     frame_count += 1
-
-             # Process remaining frames
-             if current_batch:
-                 batch_features, batch_captions = self.process_batch(current_batch)
-                 for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
-                     batch_frame_number = frame_count - (len(current_batch) - i - 1) * frame_interval
-                     self.frame_data.append({
-                         'frame_number': batch_frame_number,
-                         'timestamp': batch_frame_number / fps,
-                         'caption': caption
-                     })
-                     features_list.append(features)
-
-             cap.release()
-
-             if not features_list:
-                 raise ValueError("No frames were processed from the video")
-
              # Create FAISS index
-             features_array = np.vstack(features_list)
-             self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
-             self.frame_index.add(features_array)
-
-             self.logger.info(f"Processed {len(self.frame_data)} frames from video")

          except Exception as e:
              self.logger.error(f"Error processing video: {str(e)}")
              raise

-     def query_video(self, query_text: str, k: int = 5) -> List[Dict]:
-         """Query the video using natural language and return relevant frames."""
-         self.logger.info(f"Processing query: {query_text}")
-
          try:
-             inputs = self.clip_processor(text=[query_text], return_tensors="pt").to(self.device)
-             text_features = self.clip_model.get_text_features(**inputs)

              distances, indices = self.frame_index.search(
-                 text_features.cpu().detach().numpy(),
                  k
              )

              results = []
-             for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
                  frame_info = self.frame_data[idx].copy()
                  frame_info['relevance_score'] = float(1 / (1 + distance))
-                 results.append(frame_info)

              return results
          except Exception as e:
              self.logger.error(f"Error querying video: {str(e)}")
              raise

- class VideoRAGApp:
      def __init__(self):
-         self.rag_tool = VideoRAGTool()
          self.current_video_path = None
          self.processed = False
          self.temp_dir = tempfile.mkdtemp()

      def __del__(self):
-         """Cleanup temporary files on deletion"""
          if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
              shutil.rmtree(self.temp_dir, ignore_errors=True)

      def process_video(self, video_file):
-         """Process uploaded video and return status message"""
          try:
              if video_file is None:
-                 return "Please upload a video first."

              video_path = video_file.name
              temp_video_path = os.path.join(self.temp_dir, "current_video.mp4")
              shutil.copy2(video_path, temp_video_path)

              self.current_video_path = temp_video_path
-
-             self.rag_tool.process_video(self.current_video_path)
              self.processed = True
-             return "Video processed successfully! You can now ask questions about the video."

          except Exception as e:
              self.processed = False
-             return f"Error processing video: {str(e)}"

      def query_video(self, query_text):
-         """Query the video and return relevant frames with descriptions"""
          if not self.processed:
              return None, "Please process a video first."

          try:
-             results = self.rag_tool.query_video(query_text, k=4)
-
              frames = []
              descriptions = []
@@ -254,13 +273,14 @@ class VideoRAGApp:
                  description = f"Timestamp: {result['timestamp']:.2f}s\n"
                  description += f"Scene Description: {result['caption']}\n"
                  description += f"Relevance Score: {result['relevance_score']:.2f}"
                  descriptions.append(description)

              cap.release()

-             # Combine all descriptions with frame numbers
-             combined_description = "\n\nFrame Analysis:\n\n"
              for i, desc in enumerate(descriptions, 1):
                  combined_description += f"Frame {i}:\n{desc}\n\n"
@@ -270,42 +290,43 @@
              return None, f"Error querying video: {str(e)}"

      def create_interface(self):
-         """Create and return Gradio interface"""
-         with gr.Blocks(title="Video Chat RAG") as interface:
-             gr.Markdown("# Video Chat RAG")
-             gr.Markdown("Upload a video and ask questions about its content!")

              with gr.Row():
                  video_input = gr.File(
-                     label="Upload Video",
                      file_types=["video"],
                  )
                  process_button = gr.Button("Process Video")

-             status_output = gr.Textbox(
-                 label="Status",
-                 interactive=False
-             )

              with gr.Row():
                  query_input = gr.Textbox(
-                     label="Ask about the video",
                      placeholder="What's happening in the video?"
                  )
                  query_button = gr.Button("Search")

-             with gr.Row():
-                 gallery = gr.Gallery(
-                     label="Retrieved Frames",
-                     show_label=True,
-                     elem_id="gallery",
-                     columns=[2],
-                     rows=[2],
-                     height="auto"
-                 )

              descriptions = gr.Textbox(
-                 label="Scene Descriptions",
                  interactive=False,
                  lines=10
              )
@@ -313,7 +334,7 @@ class VideoRAGApp:
              process_button.click(
                  fn=self.process_video,
                  inputs=[video_input],
-                 outputs=[status_output]
              )

              query_button.click(
@@ -325,7 +346,7 @@
          return interface

  # Initialize and create the interface
- app = VideoRAGApp()
  interface = app.create_interface()

  # Launch the app
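Both the removed VideoRAGTool and the new analyzer class in the updated file below share the same retrieval core: CLIP image features for sampled frames stored in a flat L2 FAISS index, searched with CLIP text features for the user's query. A minimal standalone sketch of that pattern (hypothetical helper names, not the methods from this commit):

import faiss
import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device).eval()
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def build_index(frame_features: np.ndarray) -> faiss.IndexFlatL2:
    # frame_features: (num_frames, dim) float32 array of CLIP image embeddings
    index = faiss.IndexFlatL2(frame_features.shape[1])
    index.add(frame_features.astype(np.float32))
    return index

@torch.no_grad()
def search_frames(index: faiss.IndexFlatL2, query: str, k: int = 4):
    # Encode the text query with CLIP and search the frame index
    inputs = clip_processor(text=[query], return_tensors="pt", padding=True).to(device)
    text_features = clip_model.get_text_features(**inputs).cpu().numpy().astype(np.float32)
    distances, indices = index.search(text_features, k)
    # Smaller L2 distance means a closer match; the app maps it to 1 / (1 + distance).
    return list(zip(indices[0].tolist(), distances[0].tolist()))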
 
app.py (updated):

  import cv2
  import numpy as np
+ from transformers import (
+     CLIPProcessor, CLIPModel,
+     BlipProcessor, BlipForConditionalGeneration,
+     Blip2Processor, Blip2ForConditionalGeneration,
+     AutoProcessor, AutoModelForObjectDetection
+ )
  import torch
  from PIL import Image
  import faiss

  import os
  import shutil
  from tqdm import tqdm

+ class EnhancedVideoAnalyzer:
+     def __init__(self):
          self.logger = self.setup_logger()
          self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          self.logger.info(f"Using device: {self.device}")
+
+         # Initialize CLIP for general scene understanding
+         self.logger.info("Loading CLIP model...")
+         self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
+         self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+         # Initialize BLIP-2 for detailed scene description
+         self.logger.info("Loading BLIP-2 model...")
+         self.blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+         self.blip2_model = Blip2ForConditionalGeneration.from_pretrained(
+             "Salesforce/blip2-opt-2.7b",
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+         ).to(self.device)
+
+         # Initialize object detection model
+         self.logger.info("Loading object detection model...")
+         self.obj_processor = AutoProcessor.from_pretrained("microsoft/table-transformer-detection")
+         self.obj_model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection").to(self.device)
+
          self.frame_index = None
          self.frame_data = []
+         self.target_size = (384, 384)  # Increased size for better detail recognition
+         self.batch_size = 4
+
+         # Set all models to evaluation mode
+         self.clip_model.eval()
+         self.blip2_model.eval()
+         self.obj_model.eval()

      def setup_logger(self) -> logging.Logger:
+         logger = logging.getLogger('EnhancedVideoAnalyzer')
          if logger.handlers:
              logger.handlers.clear()
          logger.setLevel(logging.INFO)
          handler = logging.StreamHandler()
+         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
          handler.setFormatter(formatter)
          logger.addHandler(handler)
          return logger

      @torch.no_grad()
+     def analyze_frame(self, image: Image.Image) -> Dict:
+         """Comprehensive frame analysis"""
          try:
+             # 1. Generate detailed caption using BLIP-2 (cast inputs to the model's dtype)
+             inputs = self.blip2_processor(image, return_tensors="pt").to(self.device, self.blip2_model.dtype)
+             caption = self.blip2_model.generate(**inputs, max_new_tokens=50)
+             caption_text = self.blip2_processor.decode(caption[0], skip_special_tokens=True)
+
+             # 2. Detect objects
+             obj_inputs = self.obj_processor(images=image, return_tensors="pt").to(self.device)
+             obj_outputs = self.obj_model(**obj_inputs)
+
+             # Process object detection results
+             target_sizes = torch.tensor([image.size[::-1]])
+             results = self.obj_processor.post_process_object_detection(
+                 obj_outputs, threshold=0.5, target_sizes=target_sizes
+             )[0]
+
+             detected_objects = []
+             for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+                 detected_objects.append({
+                     "label": self.obj_model.config.id2label[label.item()],
+                     "confidence": score.item()
+                 })
+
+             return {
+                 "caption": caption_text,
+                 "objects": detected_objects
+             }
+
          except Exception as e:
+             self.logger.error(f"Error in frame analysis: {str(e)}")
+             return {"caption": "Error analyzing frame", "objects": []}

+     def extract_keyframes(self, video_path: str, max_frames: int = 15) -> List[Tuple[int, np.ndarray]]:
+         """Extract key frames using scene detection"""
          cap = cv2.VideoCapture(video_path)
          total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
          fps = cap.get(cv2.CAP_PROP_FPS)
+
+         # Calculate frame interval to get approximately max_frames
+         frame_interval = max(1, total_frames // max_frames)
+
+         frames = []
+         frame_positions = []
+         prev_gray = None
+
+         with tqdm(total=total_frames, desc="Extracting frames") as pbar:
+             while cap.isOpened() and len(frames) < max_frames:
+                 ret, frame = cap.read()
+                 if not ret:
+                     break
+
+                 # Convert to grayscale for scene detection
+                 gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+
+                 if prev_gray is not None:
+                     # Calculate frame difference
+                     diff = cv2.absdiff(gray, prev_gray)
+                     mean_diff = np.mean(diff)
+
+                     # If significant change or first frame
+                     if mean_diff > 30 or len(frames) == 0:
+                         frames.append(frame)
+                         frame_positions.append(cap.get(cv2.CAP_PROP_POS_FRAMES))
+
+                 prev_gray = gray
+                 pbar.update(1)
+
          cap.release()
+         return list(zip(frame_positions, frames))

      @torch.no_grad()
+     def process_video(self, video_path: str) -> None:
+         """Process video with comprehensive analysis"""
          self.logger.info(f"Processing video: {video_path}")
+         self.frame_data = []
+         features_list = []

          try:
+             # Extract key frames
+             keyframes = self.extract_keyframes(video_path)
+             self.logger.info(f"Extracted {len(keyframes)} key frames")

+             # Process frames with progress bar
+             with tqdm(total=len(keyframes), desc="Analyzing frames") as pbar:
+                 for frame_pos, frame in keyframes:
+                     # Convert frame to PIL Image
+                     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                     image = Image.fromarray(frame_rgb).resize(self.target_size, Image.LANCZOS)

+                     # Analyze frame
+                     analysis = self.analyze_frame(image)

+                     # Get CLIP features
+                     clip_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
+                     image_features = self.clip_model.get_image_features(**clip_inputs)
+
+                     # Store results
+                     self.frame_data.append({
+                         'frame_number': int(frame_pos),
+                         'timestamp': frame_pos / 30.0,  # Approximate timestamp
+                         'caption': analysis['caption'],
+                         'objects': analysis['objects']
+                     })
+
+                     features_list.append(image_features.cpu().numpy())
+                     pbar.update(1)
+
              # Create FAISS index
+             if features_list:
+                 features_array = np.vstack(features_list)
+                 self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
+                 self.frame_index.add(features_array)
+
+             self.logger.info("Video processing completed successfully")

          except Exception as e:
              self.logger.error(f"Error processing video: {str(e)}")
              raise

+     @torch.no_grad()
+     def query_video(self, query_text: str, k: int = 4) -> List[Dict]:
+         """Enhanced query processing"""
          try:
+             # Process query with CLIP
+             text_inputs = self.clip_processor(text=[query_text], return_tensors="pt").to(self.device)
+             text_features = self.clip_model.get_text_features(**text_inputs)

+             # Search for relevant frames
              distances, indices = self.frame_index.search(
+                 text_features.cpu().numpy(),
                  k
              )

+             # Prepare results with enhanced information
              results = []
+             for distance, idx in zip(distances[0], indices[0]):
                  frame_info = self.frame_data[idx].copy()
+
+                 # Add relevance score
                  frame_info['relevance_score'] = float(1 / (1 + distance))

+                 # Add object summary
+                 obj_summary = ", ".join(obj["label"] for obj in frame_info['objects'][:3])
+                 if obj_summary:
+                     frame_info['object_summary'] = f"Objects detected: {obj_summary}"
+
+                 results.append(frame_info)
+
              return results
+
          except Exception as e:
              self.logger.error(f"Error querying video: {str(e)}")
              raise

+ class VideoQAApp:
      def __init__(self):
+         self.analyzer = EnhancedVideoAnalyzer()
          self.current_video_path = None
          self.processed = False
          self.temp_dir = tempfile.mkdtemp()

      def __del__(self):
          if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
              shutil.rmtree(self.temp_dir, ignore_errors=True)

      def process_video(self, video_file):
+         """Process video with progress updates"""
          try:
              if video_file is None:
+                 return "Please upload a video first.", gr.Progress(0)

              video_path = video_file.name
              temp_video_path = os.path.join(self.temp_dir, "current_video.mp4")
              shutil.copy2(video_path, temp_video_path)

              self.current_video_path = temp_video_path
+             self.analyzer.process_video(self.current_video_path)
              self.processed = True
+
+             return "Video processed successfully! You can now ask questions about the video.", gr.Progress(100)

          except Exception as e:
              self.processed = False
+             return f"Error processing video: {str(e)}", gr.Progress(0)

      def query_video(self, query_text):
+         """Query video with comprehensive results"""
          if not self.processed:
              return None, "Please process a video first."

          try:
+             results = self.analyzer.query_video(query_text)

              frames = []
              descriptions = []

                  description = f"Timestamp: {result['timestamp']:.2f}s\n"
                  description += f"Scene Description: {result['caption']}\n"
+                 if 'object_summary' in result:
+                     description += f"{result['object_summary']}\n"
                  description += f"Relevance Score: {result['relevance_score']:.2f}"
                  descriptions.append(description)

              cap.release()

+             combined_description = "\n\nScene Analysis:\n\n"
              for i, desc in enumerate(descriptions, 1):
                  combined_description += f"Frame {i}:\n{desc}\n\n"

              return None, f"Error querying video: {str(e)}"

      def create_interface(self):
+         """Create Gradio interface"""
+         with gr.Blocks(title="Video Question Answering") as interface:
+             gr.Markdown("# Advanced Video Question Answering")
+             gr.Markdown("Upload a video and ask questions about any aspect of its content!")

              with gr.Row():
                  video_input = gr.File(
+                     label="Upload Video (Recommended: 30 seconds to 5 minutes)",
                      file_types=["video"],
                  )
                  process_button = gr.Button("Process Video")

+             with gr.Row():
+                 status_output = gr.Textbox(
+                     label="Status",
+                     interactive=False
+                 )
+                 progress = gr.Progress()

              with gr.Row():
                  query_input = gr.Textbox(
+                     label="Ask anything about the video",
                      placeholder="What's happening in the video?"
                  )
                  query_button = gr.Button("Search")

+             gallery = gr.Gallery(
+                 label="Retrieved Frames",
+                 show_label=True,
+                 elem_id="gallery",
+                 columns=[2],
+                 rows=[2],
+                 height="auto"
+             )

              descriptions = gr.Textbox(
+                 label="Scene Analysis",
                  interactive=False,
                  lines=10
              )

              process_button.click(
                  fn=self.process_video,
                  inputs=[video_input],
+                 outputs=[status_output, progress]
              )

              query_button.click(

          return interface

  # Initialize and create the interface
+ app = VideoQAApp()
  interface = app.create_interface()

  # Launch the app
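The diff view cuts off after this final comment, so the launch call itself is not shown. Below is a hypothetical usage sketch (not part of the commit) for exercising the new analyzer headlessly and then starting the interface the way Gradio apps are usually launched; note that gr.Progress is normally injected into an event handler as a default argument rather than returned as an output value.

# Hypothetical usage sketch; assumes a local file "sample.mp4" exists.
analyzer = EnhancedVideoAnalyzer()
analyzer.process_video("sample.mp4")
for hit in analyzer.query_video("a person walking through a doorway", k=2):
    print(f"{hit['timestamp']:.1f}s  {hit['caption']}  {hit.get('object_summary', '')}")

# Typical way the Gradio interface defined above is started:
interface.launch()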