zamalali committed on
Commit 9a14671 · 1 Parent(s): 03b7d0b

Clean push: only core files

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +251 -0
  3. main.py +372 -0
  4. requirements.txt +23 -0
.gitignore ADDED
@@ -0,0 +1 @@
+.env
app.py ADDED
@@ -0,0 +1,251 @@
+import os
+import cv2
+import gradio as gr
+from dotenv import load_dotenv
+import spaces
+
+from main import (
+    run,
+    detect_scenes,
+    extract_keyframes,
+    generate_scene_caption,
+    generate_video_summary,
+    generate_video_summary_groq,
+    vqa_matches,
+    semantic_matches,
+    remove_scenes,
+)
+
+# Load environment variables
+load_dotenv()
+if not os.getenv("HF_TOKEN"):
+    raise ValueError("❌ Error: HF_TOKEN not found in .env file")
+
+
+@spaces.GPU
+def process_video(video_path, query, progress=gr.Progress()):
+    """Scene-filtering tab: remove scenes matching the query."""
+    try:
+        os.makedirs("outputs", exist_ok=True)
+        output_path = os.path.join("outputs", "trimmed_video.mp4")
+
+        # 1) Detect scenes
+        progress(0.0, desc="Detecting scenes...")
+        scenes = detect_scenes(video_path)
+
+        # 2) Extract keyframes
+        progress(0.2, desc="Extracting keyframes...")
+        keyframes = extract_keyframes(video_path, scenes)
+
+        # 3) Caption each keyframe
+        progress(0.4, desc="Generating captions...")
+        captions = [generate_scene_caption(frame) for _, frame in keyframes]
+
+        # 4) VQA + semantic filtering
+        progress(0.6, desc="Analyzing scenes...")
+        vqa_mask = vqa_matches(keyframes, query)
+        sem_idxs, _ = semantic_matches(captions, query)
+
+        # 5) Build removal list (union of VQA hits and semantic hits)
+        to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))
+
+        # 6) Trim via ffmpeg
+        progress(0.8, desc="Processing video...")
+        if to_remove:
+            remove_scenes(video_path, scenes, to_remove, output_path)
+
+            # Verify the output video exists
+            if not os.path.exists(output_path):
+                return None, "❌ Error: Failed to create output video"
+
+            # Check that the generated video can be opened
+            cap = cv2.VideoCapture(output_path)
+            if not cap.isOpened():
+                return None, "❌ Error: Generated video is invalid"
+            cap.release()
+
+            stats = [
+                "✅ Processing complete!",
+                f"📊 Total scenes: {len(scenes)}",
+                f"🗑️ Scenes removed: {len(to_remove)}",
+                f"🎬 Scenes kept: {len(scenes) - len(to_remove)}",
+                "\n🔍 Scene captions:",
+                *[f"[Scene {i}]: {c}" for i, c in enumerate(captions)]
+            ]
+            return output_path, "\n".join(stats)
+        else:
+            return None, "⚠️ No matching scenes found; no trimming done."
+    except Exception as e:
+        return None, f"❌ Error: {e}"
+
+
+@spaces.GPU
+def generate_video_description(video_path, progress=gr.Progress()):
+    """Video-description tab: full scene-by-scene summary."""
+    try:
+        progress(0.0, desc="Detecting scenes...")
+        scenes = detect_scenes(video_path)
+
+        progress(0.3, desc="Extracting keyframes...")
+        keyframes = extract_keyframes(video_path, scenes)
+
+        progress(0.6, desc="Captioning scenes...")
+        captions = [generate_scene_caption(frame) for _, frame in keyframes]
+
+        # Build & return the summary paragraph
+        summary = generate_video_summary(captions)
+        return summary
+    except Exception as e:
+        return f"❌ Error: {e}"
+
+
+@spaces.GPU
+def get_frame_description(video_path, frame_number):
+    """Frame-analysis tab: caption a single frame."""
+    try:
+        cap = cv2.VideoCapture(video_path)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_number))
+        ret, frame = cap.read()
+        cap.release()
+
+        if not ret:
+            return "❌ Invalid frame number"
+        return f"Frame {frame_number}:\n{generate_scene_caption(frame)}"
+    except Exception as e:
+        return f"❌ Error: {e}"
+
+
+# ─── Gradio UI ────────────────────────────────────────────────────────────────
+
+with gr.Blocks(theme=gr.themes.Soft(), css="""
+footer {visibility: hidden}
+.custom-footer {
+    text-align: center;
+    margin-top: 2em;
+    margin-bottom: 1em;
+    color: #666;
+}
+.description {
+    color: #666;
+    font-size: 0.9em;
+    line-height: 1.5;
+}
+.tech-stack {
+    background: #f5f5f5;
+    padding: 1em;
+    border-radius: 8px;
+    margin: 1em 0;
+}
+""") as demo:
+    gr.Markdown("""
+# Videoxity
+
+A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.
+
+<div class="description">
+This application demonstrates the capabilities of modern AI in video processing, offering a foundation for developers to build upon and optimize.
+Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding.
+</div>
+
+<div class="tech-stack">
+<strong>Technical Stack:</strong>
+- Scene Detection: PySceneDetect with ContentDetector
+- Vision Models: BLIP (Image Captioning & VQA)
+- Language Models: Groq LLM (Llama 3.1)
+- Video Processing: OpenCV & FFmpeg
+- Embeddings: BGE-Small for semantic search
+</div>
+""")
+
+    with gr.Tabs():
+        # 1) Scene Filtering
+        with gr.TabItem("Frames to Cut"):
+            gr.Markdown("""
+### Remove specific scenes from your video
+Upload a video and describe which scenes you want to remove. The AI will analyze each scene and cut out the matching ones.
+
+Examples:
+- "Remove the part where there is a cat in the video"
+- "Cut out the scene where people are dancing"
+""")
+            with gr.Row():
+                with gr.Column():
+                    vid1 = gr.Video(
+                        label="Upload Video",
+                        format="mp4",
+                        interactive=True
+                    )
+                    qry1 = gr.Textbox(
+                        label="Scenes to Remove",
+                        placeholder="e.g., 'Remove the part where there is a cat in the video'",
+                        lines=2
+                    )
+                    btn1 = gr.Button("Process Video", variant="primary")
+                with gr.Column():
+                    outVid = gr.Video(
+                        label="Processed Video",
+                        format="mp4",
+                        interactive=True
+                    )
+                    outTxt = gr.Textbox(label="Results", lines=10)
+            btn1.click(
+                fn=process_video,
+                inputs=[vid1, qry1],
+                outputs=[outVid, outTxt]
+            )
+
+        # 2) Video Description
+        with gr.TabItem("Video Description"):
+            gr.Markdown("""
+### Generate a comprehensive description of your video
+Get AI-generated descriptions for all scenes in your video.
+""")
+            with gr.Row():
+                with gr.Column():
+                    vid2 = gr.Video(label="Upload Video")
+                    btn2 = gr.Button("Generate Description", variant="primary")
+                with gr.Column():
+                    outDesc = gr.Textbox(
+                        label="Video Description",
+                        lines=15,
+                        show_copy_button=True
+                    )
+            btn2.click(
+                fn=generate_video_description,
+                inputs=[vid2],
+                outputs=[outDesc]
+            )
+
+        # 3) Frame Analysis
+        with gr.TabItem("Frame Analysis"):
+            gr.Markdown("""
+### Analyze specific frames in your video
+Get detailed descriptions for individual frames.
+""")
+            with gr.Row():
+                with gr.Column():
+                    vid3 = gr.Video(label="Upload Video")
+                    fn3 = gr.Number(
+                        label="Frame Number",
+                        value=0,
+                        precision=0,
+                        minimum=0
+                    )
+                    btn3 = gr.Button("Analyze Frame", variant="primary")
+                with gr.Column():
+                    outFrm = gr.Textbox(
+                        label="Frame Description",
+                        lines=5,
+                        show_copy_button=True
+                    )
+            btn3.click(
+                fn=get_frame_description,
+                inputs=[vid3, fn3],
+                outputs=[outFrm]
+            )
+
+    # Custom centered footer
+    gr.Markdown("""
+<div class="custom-footer">
+Made with ❤️
+</div>
+""", elem_classes=["custom-footer"])
+
+if __name__ == "__main__":
+    demo.launch(share=True, show_error=True, show_api=False)
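
A minimal smoke test of the pieces app.py wires together (a sketch, not part of this commit: "sample.mp4" is a placeholder path, and the models in main.py load at import time):

    from main import detect_scenes, extract_keyframes, generate_scene_caption

    scenes = detect_scenes("sample.mp4")                 # PySceneDetect ContentDetector
    keyframes = extract_keyframes("sample.mp4", scenes)  # middle frame of each scene
    print(f"{len(scenes)} scenes, {len(keyframes)} keyframes")
    if keyframes:
        print(generate_scene_caption(keyframes[0][1]))   # BLIP caption of the first keyframe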
main.py ADDED
@@ -0,0 +1,372 @@
+import os
+import sys
+import ast
+import cv2
+import subprocess
+from tqdm import tqdm
+from PIL import Image
+from dotenv import load_dotenv
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate
+from transformers import pipeline
+from scenedetect import SceneManager, open_video, ContentDetector
+from sentence_transformers import SentenceTransformer, util
+
+# ─── 1. AUTH & MODELS ────────────────────────────────────────────────────────────
+
+# Load environment variables
+load_dotenv()
+HF_TOKEN = os.getenv("HF_TOKEN")
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+
+if not HF_TOKEN:
+    print("❌ Error: HF_TOKEN not found in .env file")
+    sys.exit(1)
+
+# Initialize models with proper configurations
+captioner = pipeline(
+    "image-to-text",
+    model="Salesforce/blip-image-captioning-base",
+    device="cpu"
+)
+
+vl_pipeline = pipeline(
+    "visual-question-answering",
+    model="Salesforce/blip-vqa-base",
+    device="cpu"
+)
+
+elaborator = pipeline(
+    "text-generation",
+    model="gpt2-medium",
+    device="cpu",
+    max_new_tokens=500,  # use max_new_tokens instead of max_length
+    do_sample=True,
+    top_p=0.9,
+    temperature=0.7
+)
+
+embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
+
+
+# ─── 2. HELPERS ──────────────────────────────────────────────────────────────────
+
+def run_ffmpeg(cmd):
+    full = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"] + cmd
+    p = subprocess.Popen(full, stderr=subprocess.PIPE)
+    _, err = p.communicate()
+    if p.returncode != 0:
+        print("❌ FFmpeg error:\n", err.decode())
+        sys.exit(1)
+
+
+# ─── 3. SCENE DETECTION & KEYFRAMES ──────────────────────────────────────────────
+
+def detect_scenes(video_path, thresh=15.0):
+    v = open_video(video_path)
+    mgr = SceneManager()
+    mgr.add_detector(ContentDetector(threshold=thresh))
+    mgr.detect_scenes(v)
+    return mgr.get_scene_list()
+
+
+def get_removal_indices_groq(captions, query):
+    """Ask the Groq LLM which scenes to remove; returns the indexes as labeled (1-based)."""
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        temperature=0.2,
+        max_tokens=500
+    )
+
+    prompt = ChatPromptTemplate.from_messages([
+        (
+            "system",
+            "You are a helpful assistant for video analysis. The user will give you a list of scene captions, "
+            "each labeled with an index like [1], [2], ..., and a filtering instruction like 'remove food scenes'.\n\n"
+            "Return ONLY the list of indexes that should be removed — e.g., [2, 5, 9]\n"
+            "⚠️ Do not explain, describe, or add any commentary. Your response MUST be a valid Python list of integers."
+        ),
+        (
+            "human",
+            "Filtering instruction: {query}\n\nCaptions:\n{captions}"
+        )
+    ])
+
+    chain = prompt | llm
+    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
+
+    response = None
+    try:
+        response = chain.invoke({"query": query, "captions": captions_formatted})
+        # Parse safely rather than eval()-ing model output
+        to_remove = ast.literal_eval(response.content.strip())
+
+        if not isinstance(to_remove, list) or not all(isinstance(i, int) for i in to_remove):
+            raise ValueError("Invalid format")
+
+    except Exception:
+        print(f"❌ LLM returned invalid output: {response.content if response else '(no response)'}")
+        to_remove = []
+
+    return to_remove
+
+
+def groq_llm(prompt):
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        temperature=0.2,
+        max_tokens=500
+    )
+    return llm.invoke(prompt).content.strip()
+
+
+def extract_keyframes(video_path, scenes):
+    cap, frames = cv2.VideoCapture(video_path), []
+    for s, e in scenes:
+        # Use the middle frame of each scene as its keyframe
+        mid = (s.get_frames() + e.get_frames()) // 2
+        cap.set(cv2.CAP_PROP_POS_FRAMES, mid)
+        ok, img = cap.read()
+        if ok:
+            frames.append((mid, img))
+    cap.release()
+    return frames
+
+
+# ─── 4. DESCRIPTIONS & SUMMARY ───────────────────────────────────────────────────
+
+def generate_scene_caption(frame):
+    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    return captioner(img)[0]["generated_text"]
+
+
+def generate_video_summary_groq(captions):
+    """Generate a video summary using the Groq LLM."""
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        temperature=0.2,
+        max_tokens=500
+    )
+
+    prompt = ChatPromptTemplate.from_messages([
+        (
+            "system",
+            "You are a helpful assistant for video analysis. The user will give you a list of scene captions from a video. "
+            "Your task is to write a concise, narrative summary of what happens in the video, focusing only on the events shown. "
+            "Make it engaging and easy to understand. Do not include any titles, links, or external references."
+        ),
+        (
+            "human",
+            "Here are the scene captions from the video in order:\n{captions}\n\nPlease provide a narrative summary."
+        )
+    ])
+
+    chain = prompt | llm
+    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
+
+    try:
+        response = chain.invoke({"captions": captions_formatted})
+        summary = response.content.strip()
+
+        # Format the final output
+        return f"""🎬 Video Summary:
+{summary}
+
+📊 Total Scenes: {len(captions)}
+
+🔍 Key Moments:
+{chr(10).join(f"• {cap}" for cap in captions[:5])}
+..."""
+    except Exception as e:
+        print(f"❌ Error generating summary with Groq: {e}")
+        return "❌ Error: Failed to generate video summary"
+
+
+def generate_video_summary(captions):
+    """Generate a video summary using the Groq LLM."""
+    return generate_video_summary_groq(captions)
+
+
+def filter_scenes_with_llm(captions, query, llm):
+    """
+    Uses an LLM to determine which scenes to remove based on captions and a user query.
+
+    Args:
+        captions (List[str]): List of scene/frame captions.
+        query (str): User intent, e.g. "Remove scenes with Trump".
+        llm (callable): Function to call your LLM, e.g. `llm(prompt)`.
+
+    Returns:
+        List[int]: List of 0-based scene indexes to remove.
+    """
+    formatted = "\n".join(f"{i+1}. {cap}" for i, cap in enumerate(captions))
+    prompt = f"""
+You're an intelligent video assistant.
+
+The user wants to: **{query}**
+
+Below are numbered captions for each scene in a video:
+{formatted}
+
+👉 Return a Python list of only the scene numbers that should be removed based on the user query.
+👉 ONLY return the list like this: [3, 5, 11]. No explanation.
+"""
+
+    # Run LLM
+    response = llm(prompt)
+
+    try:
+        result = ast.literal_eval(response.strip())
+        return [i - 1 for i in result]  # convert to 0-based indexes
+    except (ValueError, SyntaxError):
+        print("⚠️ Failed to parse LLM output:", response)
+        return []
+
+
+# ─── 5. FILTERING ───────────────────────────────────────────────────────────────
+
+def group_indices(indices):
+    """Group consecutive indices together as chunks."""
+    if not indices:
+        return []
+    indices = sorted(indices)
+    groups = [[indices[0]]]
+    for i in indices[1:]:
+        if i == groups[-1][-1] + 1:
+            groups[-1].append(i)
+        else:
+            groups.append([i])
+    return groups
+
+
+def vqa_matches(keyframes, question):
+    flags = []
+    for _, frame in keyframes:
+        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        ans = vl_pipeline({"image": img, "question": question})
+        flags.append("yes" in ans[0]["answer"].lower())
+    return flags
+
+
+def semantic_matches(captions, prompt, thresh=0.8):
+    embs = embedder.encode(captions, convert_to_tensor=True)
+    q = embedder.encode(prompt, convert_to_tensor=True)
+    sims = util.cos_sim(q, embs)[0]
+    return [i for i, s in enumerate(sims) if s >= thresh], sims.tolist()
+
+
+# ─── 6. TRIMMING ────────────────────────────────────────────────────────────────
+
+def remove_scenes(video_path, scenes, to_remove, out="trimmed.mp4"):
+    times = [(float(s.get_seconds()), float(e.get_seconds())) for s, e in scenes]
+
+    # Group deletions
+    remove_groups = group_indices(to_remove)
+
+    # Threshold: max N consecutive scenes to allow trimming
+    MAX_REMOVE_GROUP_SIZE = 4
+
+    # Adjust `to_remove`: only allow small groups or isolated removals
+    filtered_remove = []
+    for group in remove_groups:
+        if len(group) <= MAX_REMOVE_GROUP_SIZE:
+            filtered_remove.extend(group)
+
+    # Protect the last three scenes from removal (applied after the list is built)
+    if len(scenes) > 3:
+        last_scene_idx = len(scenes) - 1
+        for i in range(last_scene_idx - 2, last_scene_idx + 1):
+            if i in filtered_remove:
+                filtered_remove.remove(i)
+
+    print(f"🧩 Filtered scenes to remove (after capping long chunks): {filtered_remove}")
+
+    # Final list of segments to keep
+    keep = [t for i, t in enumerate(times) if i not in filtered_remove]
+
+    # Create a temporary directory for segments
+    os.makedirs("temp_segments", exist_ok=True)
+
+    parts = []  # defined before try so cleanup in finally is safe
+    try:
+        for i, (ss, tt) in enumerate(keep):
+            fn = os.path.join("temp_segments", f"segment_{i}.mp4")
+            # Re-encode each kept segment to maintain frame integrity
+            run_ffmpeg([
+                "-i", video_path,
+                "-ss", str(ss),
+                "-to", str(tt),
+                "-c:v", "libx264",          # H.264 video codec
+                "-preset", "medium",        # balance between speed and quality
+                "-crf", "23",               # constant rate factor for quality
+                "-c:a", "aac",              # audio codec
+                "-b:a", "128k",             # audio bitrate
+                "-movflags", "+faststart",  # enable fast start for web playback
+                fn
+            ])
+            parts.append(fn)
+
+        # Create concat file
+        with open("parts.txt", "w") as f:
+            for p in parts:
+                f.write(f"file '{p}'\n")
+
+        # Concatenate segments with the same encoding settings
+        run_ffmpeg([
+            "-f", "concat",
+            "-safe", "0",
+            "-i", "parts.txt",
+            "-c:v", "libx264",
+            "-preset", "medium",
+            "-crf", "23",
+            "-c:a", "aac",
+            "-b:a", "128k",
+            "-movflags", "+faststart",
+            out
+        ])
+
+    finally:
+        # Cleanup
+        for p in parts:
+            if os.path.exists(p):
+                os.remove(p)
+        if os.path.exists("parts.txt"):
+            os.remove("parts.txt")
+        if os.path.exists("temp_segments"):
+            os.rmdir("temp_segments")
+
+
+# ─── 7. MAIN PIPELINE ──────────────────────────────────────────────────────────
+
+def run(video, query):
+    print(f"\n🎥 Video: {video}\n🔎 Query: '{query}'\n")
+
+    scenes = detect_scenes(video)
+    print(f"🔢 {len(scenes)} scenes detected.")
+
+    keyframes = extract_keyframes(video, scenes)
+    print(f"🖼️ {len(keyframes)} keyframes extracted.\n")
+
+    captions = [generate_scene_caption(f) for _, f in tqdm(keyframes, desc="Generating captions")]
+    summary = generate_video_summary(captions)
+    print("\n--- Video Summary ---")
+    print(summary)
+
+    # 🧠 Let the LLM decide which scenes to remove based on captions
+    to_remove = filter_scenes_with_llm(captions, query, groq_llm)
+    print(f"\n🔴 Scenes to remove: {to_remove}")
+
+    if to_remove:
+        remove_scenes(video, scenes, to_remove)
+        print("✅ Trimmed video saved as `trimmed.mp4`.")
+    else:
+        print("⚠️ No matching scenes found; no trimming done.")
+
+    return to_remove  # optional: return for external use
+
+
+# ─── 8. ENTRY POINT ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print('Usage: python main.py <video.mp4> "your query here"')
+        sys.exit(1)
+    run(sys.argv[1], sys.argv[2])
requirements.txt ADDED
@@ -0,0 +1,23 @@
+# Core dependencies
+gradio>=4.19.2
+opencv-python>=4.9.0.80
+python-dotenv>=1.0.0
+Pillow>=10.2.0
+spaces>=0.1.0
+
+# Video processing
+scenedetect>=0.6.3
+ffmpeg-python>=0.2.0
+
+# AI/ML models
+transformers>=4.37.2
+sentence-transformers>=2.5.1
+torch>=2.2.0
+
+# LLM and embeddings
+langchain-groq>=0.0.1
+langchain-core>=0.1.27
+
+# Utilities
+tqdm>=4.66.1
+numpy>=1.26.3
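
To drive the whole pipeline outside the UI (a sketch under the commit's own assumptions: ffmpeg on PATH, HF_TOKEN and GROQ_API_KEY in .env; the file name and query below are placeholders):

    from main import run

    # Detects scenes, captions keyframes, asks the Groq LLM which ones match
    # the query, and writes the result to trimmed.mp4 in the working directory;
    # returns the list of removed scene indexes.
    removed = run("sample.mp4", "Remove the part where there is a cat in the video")
    print("Removed scene indexes:", removed)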