Spaces:

fau
/

videoxity

Running

App Files Files Community

zamalali committed on Jun 8

Commit

9a14671

1 Parent(s): 03b7d0b

Clean push: only core files

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +251 -0
main.py +372 -0
requirements.txt +23 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

app.py ADDED Viewed

	@@ -0,0 +1,251 @@

+import os
+import cv2
+import gradio as gr
+from dotenv import load_dotenv
+import spaces
+from main import (
+    run,
+    detect_scenes,
+    extract_keyframes,
+    generate_scene_caption,
+    generate_video_summary,
+    generate_video_summary_groq,
+    vqa_matches,
+    semantic_matches,
+    remove_scenes,
+)
+# Load environment variables
+load_dotenv()
+# Fail fast at import time when HF_TOKEN is unset or empty. The check only
+# looks at the process environment; the message mentions .env because that is
+# where the variable is expected to be defined.
+if not os.getenv("HF_TOKEN"):
+    raise ValueError("❌ Error: HF_TOKEN not found in .env file")
+@spaces.GPU
+def process_video(video_path, query, progress=gr.Progress()):
+    """Scene‐filtering tab: remove scenes matching the query."""
+    try:
+        os.makedirs("outputs", exist_ok=True)
+        output_path = os.path.join("outputs", "trimmed_video.mp4")
+        # 1) Detect scenes
+        progress(0.0, desc="Detecting scenes...")
+        scenes = detect_scenes(video_path)
+        # 2) Extract keyframes
+        progress(0.2, desc="Extracting keyframes...")
+        keyframes = extract_keyframes(video_path, scenes)
+        # 3) Caption each keyframe
+        progress(0.4, desc="Generating captions...")
+        captions = [generate_scene_caption(frame) for _, frame in keyframes]
+        # 4) VQA + semantic filtering
+        progress(0.6, desc="Analyzing scenes...")
+        vqa_mask   = vqa_matches(keyframes, query)
+        sem_idxs, _= semantic_matches(captions, query)
+        # 5) Build removal list
+        to_remove = sorted({i for i, flag in enumerate(vqa_mask) if flag} | set(sem_idxs))
+        # 6) Trim via ffmpeg
+        progress(0.8, desc="Processing video...")
+        if to_remove:
+            remove_scenes(video_path, scenes, to_remove, output_path)
+            # Verify the output video
+            if not os.path.exists(output_path):
+                return None, "❌ Error: Failed to create output video"
+            # Check if video is valid
+            cap = cv2.VideoCapture(output_path)
+            if not cap.isOpened():
+                return None, "❌ Error: Generated video is invalid"
+            cap.release()
+            stats = [
+                "✅ Processing complete!",
+                f"📊 Total scenes: {len(scenes)}",
+                f"🗑️ Scenes removed: {len(to_remove)}",
+                f"🎬 Scenes kept: {len(scenes)-len(to_remove)}",
+                "\n🔍 Scene captions:",
+                *[f"[Scene {i}]: {cap}" for i, cap in enumerate(captions)]
+            ]
+            return output_path, "\n".join(stats)
+        else:
+            return None, "⚠️ No matching scenes found; no trimming done."
+    except Exception as e:
+        return None, f"❌ Error: {e}"
+@spaces.GPU
+def generate_video_description(video_path, progress=gr.Progress()):
+    """Video‐description tab: full scene‐by‐scene summary."""
+    try:
+        progress(0.0, desc="Detecting scenes...")
+        scenes = detect_scenes(video_path)
+        progress(0.3, desc="Extracting keyframes...")
+        keyframes = extract_keyframes(video_path, scenes)
+        progress(0.6, desc="Captioning scenes...")
+        captions = [generate_scene_caption(frame) for _, frame in keyframes]
+        # build & return the summary paragraph
+        summary = generate_video_summary(captions)
+        return summary
+    except Exception as e:
+        return f"❌ Error: {e}"
+@spaces.GPU
+def get_frame_description(video_path, frame_number):
+    """Frame‐analysis tab: caption a single frame."""
+    try:
+        cap = cv2.VideoCapture(video_path)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_number))
+        ret, frame = cap.read()
+        cap.release()
+        if not ret:
+            return "❌ Invalid frame number"
+        return f"Frame {frame_number}:\n{generate_scene_caption(frame)}"
+    except Exception as e:
+        return f"❌ Error: {e}"
+# ─── Gradio UI ────────────────────────────────────────────────────────────────
+# Build the three-tab interface. The inline CSS hides the default footer and
+# styles the description, tech-stack and custom-footer elements used below.
+with gr.Blocks(theme=gr.themes.Soft(), css="""
+    footer {visibility: hidden}
+    .custom-footer {
+        text-align: center;
+        margin-top: 2em;
+        margin-bottom: 1em;
+        color: #666;
+    }
+    .description {
+        color: #666;
+        font-size: 0.9em;
+        line-height: 1.5;
+    }
+    .tech-stack {
+        background: #f5f5f5;
+        padding: 1em;
+        border-radius: 8px;
+        margin: 1em 0;
+    }
+""") as demo:
+    # Page header: title, short description and the list of components used.
+    gr.Markdown("""
+    # Videoxity
+    A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.
+    <div class="description">
+    This application demonstrates the capabilities of modern AI in video processing, offering a foundation for developers to build upon and optimize.
+    Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding.
+    </div>
+    <div class="tech-stack">
+    <strong>Technical Stack:</strong>
+    - Scene Detection: PySceneDetect with ContentDetector
+    - Vision Models: BLIP (Image Captioning & VQA)
+    - Language Models: Groq LLM (Llama 3.1)
+    - Video Processing: OpenCV & FFmpeg
+    - Embeddings: BGE-Small for semantic search
+    </div>
+    """)
+    with gr.Tabs():
+        # 1) Scene Filtering
+        with gr.TabItem("Frames to Cut"):
+            gr.Markdown("""
+            ### Remove specific scenes from your video
+            Upload a video and describe which scenes you want to remove. The AI will analyze each scene and cut out the matching ones.
+            Examples:
+            - "Remove the part where there is a cat in the video"
+            - "Cut out the scene where people are dancing"
+            """)
+            with gr.Row():
+                # Left column: inputs (video + removal query + trigger button).
+                with gr.Column():
+                    vid1 = gr.Video(
+                        label="Upload Video",
+                        format="mp4",
+                        interactive=True
+                    )
+                    qry1 = gr.Textbox(
+                        label="Scenes to Remove",
+                        placeholder="e.g., 'Remove the part where there is a cat in the video'",
+                        lines=2
+                    )
+                    btn1 = gr.Button("Process Video", variant="primary")
+                # Right column: outputs (trimmed video + textual report).
+                with gr.Column():
+                    outVid = gr.Video(
+                        label="Processed Video",
+                        format="mp4",
+                        interactive=True
+                    )
+                    outTxt = gr.Textbox(label="Results", lines=10)
+            # process_video returns (video path or None, status text), matching
+            # the two outputs in order.
+            btn1.click(
+                fn=process_video,
+                inputs=[vid1, qry1],
+                outputs=[outVid, outTxt]
+            )
+        # 2) Video Description
+        with gr.TabItem("Video Description"):
+            gr.Markdown("""
+            ### Generate a comprehensive description of your video
+            Get AI-generated descriptions for all scenes in your video.
+            """)
+            with gr.Row():
+                with gr.Column():
+                    vid2 = gr.Video(label="Upload Video")
+                    btn2 = gr.Button("Generate Description", variant="primary")
+                with gr.Column():
+                    outDesc = gr.Textbox(
+                        label="Video Description",
+                        lines=15,
+                        show_copy_button=True
+                    )
+            # generate_video_description returns a single string (summary or error).
+            btn2.click(
+                fn=generate_video_description,
+                inputs=[vid2],
+                outputs=[outDesc]
+            )
+        # 3) Frame Analysis
+        with gr.TabItem("Frame Analysis"):
+            gr.Markdown("""
+            ### Analyze specific frames in your video
+            Get detailed descriptions for individual frames.
+            """)
+            with gr.Row():
+                with gr.Column():
+                    vid3 = gr.Video(label="Upload Video")
+                    # Integer-valued, non-negative frame index (precision=0, minimum=0).
+                    fn3 = gr.Number(
+                        label="Frame Number",
+                        value=0,
+                        precision=0,
+                        minimum=0
+                    )
+                    btn3 = gr.Button("Analyze Frame", variant="primary")
+                with gr.Column():
+                    outFrm = gr.Textbox(
+                        label="Frame Description",
+                        lines=5,
+                        show_copy_button=True
+                    )
+            # get_frame_description returns a single string (caption or error).
+            btn3.click(
+                fn=get_frame_description,
+                inputs=[vid3, fn3],
+                outputs=[outFrm]
+            )
+    # Add custom centered footer
+    gr.Markdown("""
+    <div class="custom-footer">
+    Made with ❤️
+    </div>
+    """, elem_classes=["custom-footer"])
+# Start the Gradio server only when this file is executed as a script; merely
+# importing the module builds `demo` without launching it.
+if __name__ == "__main__":
+    demo.launch(share=True, show_error=True, show_api=False)

main.py ADDED Viewed

	@@ -0,0 +1,372 @@

+import os
+import sys
+import cv2
+import subprocess
+from tqdm import tqdm  # add this at the top
+from PIL import Image
+from dotenv import load_dotenv
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate
+from transformers import pipeline
+from scenedetect import SceneManager, open_video, ContentDetector
+from sentence_transformers import SentenceTransformer, util
+# ─── 1. AUTH & MODELS ────────────────────────────────────────────────────────────
+# Load environment variables
+load_dotenv()
+# Credentials read from the environment. Neither value is passed explicitly to
+# any call in this file. NOTE(review): presumably the Hugging Face and Groq
+# client libraries read them from the environment themselves — confirm.
+HF_TOKEN = os.getenv("HF_TOKEN")
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+# Importing this module without HF_TOKEN terminates the interpreter with
+# exit status 1.
+if not HF_TOKEN:
+    print("❌ Error: HF_TOKEN not found in .env file")
+    sys.exit(1)
+# Initialize models with proper configurations
+# Every pipeline below is created once, at import time, with device="cpu".
+# Image-captioning pipeline; called by generate_scene_caption().
+captioner = pipeline(
+    "image-to-text",
+    model="Salesforce/blip-image-captioning-base",
+    device="cpu"
+)
+# Visual-question-answering pipeline; called by vqa_matches().
+vl_pipeline = pipeline(
+    "visual-question-answering",
+    model="Salesforce/blip-vqa-base",
+    device="cpu"
+)
+# Sampling text-generation pipeline.
+# NOTE(review): no function in this file references `elaborator` — confirm it
+# is still needed, since it is loaded on every import.
+elaborator = pipeline(
+    "text-generation",
+    model="gpt2-medium",
+    device="cpu",
+    max_new_tokens=500,  # Use max_new_tokens instead of max_length
+    do_sample=True,
+    top_p=0.9,
+    temperature=0.7
+)
+# Sentence-embedding model; semantic_matches() uses it for cosine similarity.
+embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
+# ─── 2. HELPERS ──────────────────────────────────────────────────────────────────
+def run_ffmpeg(cmd):
+    full = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"] + cmd
+    p = subprocess.Popen(full, stderr=subprocess.PIPE)
+    _, err = p.communicate()
+    if p.returncode != 0:
+        print("❌ FFmpeg error:\n", err.decode())
+        sys.exit(1)
+# ─── 3. SCENE DETECTION & KEYFRAMES ──────────────────────────────────────────────
+def detect_scenes(video_path, thresh=15.0):
+    v = open_video(video_path)
+    mgr = SceneManager()
+    mgr.add_detector(ContentDetector(threshold=thresh))
+    mgr.detect_scenes(v)
+    return mgr.get_scene_list()
+def get_removal_indices_groq(captions, query):
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        temperature=0.2,
+        max_tokens=500
+    )
+    prompt = ChatPromptTemplate.from_messages([
+        (
+            "system",
+            "You are a helpful assistant for video analysis. The user will give you a list of scene captions, "
+            "each labeled with an index like [1], [2], ..., and a filtering instruction like 'remove food scenes'.\n\n"
+            "Return ONLY the list of indexes that should be removed — e.g., [2, 5, 9]\n"
+            "⚠️ Do not explain, describe, or add any commentary. Your response MUST be a valid Python list of integers."
+        ),
+        (
+            "human",
+            "Filtering instruction: {query}\n\nCaptions:\n{captions}"
+        )
+    ])
+    chain = prompt | llm
+    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
+    try:
+        response = chain.invoke({"query": query, "captions": captions_formatted})
+        to_remove = eval(response.content.strip())
+        if not isinstance(to_remove, list) or not all(isinstance(i, int) for i in to_remove):
+            raise ValueError("Invalid format")
+    except Exception as e:
+        print(f"❌ LLM returned invalid output: {response.content}")
+        to_remove = []
+    return to_remove
+def groq_llm(prompt):
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        temperature=0.2,
+        max_tokens=500
+    )
+    return llm.invoke(prompt).content.strip()
+def extract_keyframes(video_path, scenes):
+    cap, frames = cv2.VideoCapture(video_path), []
+    for s,e in scenes:
+        mid = (s.get_frames() + e.get_frames()) // 2
+        cap.set(cv2.CAP_PROP_POS_FRAMES, mid)
+        ok, img = cap.read()
+        if ok: frames.append((mid, img))
+    cap.release()
+    return frames
+# ─── 4. DESCRIPTIONS & SUMMARY ───────────────────────────────────────────────────
+def generate_scene_caption(frame):
+    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    return captioner(img)[0]["generated_text"]
+def generate_video_summary_groq(captions):
+    """Generate a video summary using Groq LLM."""
+    llm = ChatGroq(
+        model="llama-3.1-8b-instant",
+        temperature=0.2,
+        max_tokens=500
+    )
+    prompt = ChatPromptTemplate.from_messages([
+        (
+            "system",
+            "You are a helpful assistant for video analysis. The user will give you a list of scene captions from a video. "
+            "Your task is to write a concise, narrative summary of what happens in the video, focusing only on the events shown. "
+            "Make it engaging and easy to understand. Do not include any titles, links, or external references."
+        ),
+        (
+            "human",
+            "Here are the scene captions from the video in order:\n{captions}\n\nPlease provide a narrative summary."
+        )
+    ])
+    chain = prompt | llm
+    captions_formatted = "\n".join(f"[{i+1}] {cap.strip()}" for i, cap in enumerate(captions))
+    try:
+        response = chain.invoke({"captions": captions_formatted})
+        summary = response.content.strip()
+        # Format the final output
+        return f"""🎬 Video Summary:
+{summary}
+📊 Total Scenes: {len(captions)}
+🔍 Key Moments:
+{chr(10).join(f"• {cap}" for cap in captions[:5])}
+..."""
+    except Exception as e:
+        print(f"❌ Error generating summary with Groq: {e}")
+        return "❌ Error: Failed to generate video summary"
+def generate_video_summary(captions):
+    """
+    Generate a video summary using Groq LLM.
+    """
+    return generate_video_summary_groq(captions)
+import ast
+def filter_scenes_with_llm(captions, query, llm):
+    """
+    Uses an LLM to determine which scenes to remove based on captions and a user query.
+    Args:
+        captions (List[str]): List of scene/frame captions.
+        query (str): User intent, e.g. "Remove scenes with Trump".
+        llm (callable): Function to call your LLM, e.g. `llm(prompt)`.
+    Returns:
+        List[int]: List of 0-based frame indexes to remove.
+    """
+    formatted = "\n".join([f"{i+1}. {cap}" for i, cap in enumerate(captions)])
+    prompt = f"""
+You're an intelligent video assistant.
+The user wants to: **{query}**
+Below are numbered captions for each scene in a video:
+{formatted}
+👉 Return a Python list of only the scene numbers that should be removed based on the user query.
+👉 ONLY return the list like this: [3, 5, 11]. No explanation.
+"""
+    # Run LLM
+    response = llm(prompt)
+    try:
+        result = ast.literal_eval(response.strip())
+        result = [i-1 for i in result]  # convert to 0-based index
+        return result
+    except:
+        print("⚠️ Failed to parse LLM output:", response)
+        return []
+# ─── 5. FILTERING ───────────────────────────────────────────────────────────────
+def group_indices(indices):
+    """Group consecutive indices together as chunks."""
+    if not indices:
+        return []
+    indices = sorted(indices)
+    groups = [[indices[0]]]
+    for i in indices[1:]:
+        if i == groups[-1][-1] + 1:
+            groups[-1].append(i)
+        else:
+            groups.append([i])
+    return groups
+def vqa_matches(keyframes, question):
+    flags = []
+    for _,frame in keyframes:
+        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        ans = vl_pipeline({"image": img, "question": question})
+        flags.append("yes" in ans[0]["answer"].lower())
+    return flags
+def semantic_matches(captions, prompt, thresh=0.8):
+    embs = embedder.encode(captions, convert_to_tensor=True)
+    q   = embedder.encode(prompt, convert_to_tensor=True)
+    sims = util.cos_sim(q, embs)[0]
+    return [i for i,s in enumerate(sims) if s>=thresh], sims.tolist()
+# ─── 6. TRIMMING ────────────────────────────────────────────────────────────────
+def remove_scenes(video_path, scenes, to_remove, out="trimmed.mp4"):
+    times = [(float(s.get_seconds()), float(e.get_seconds())) for s,e in scenes]
+    # Group deletions
+    remove_groups = group_indices(to_remove)
+    # Threshold: max N consecutive scenes to allow trimming
+    MAX_REMOVE_GROUP_SIZE = 4
+    # Adjust `to_remove`: only allow small groups or isolated removals
+    filtered_remove = []
+    if len(scenes) > 3:
+        last_scene_idx = len(scenes) - 1
+        for i in range(last_scene_idx - 2, last_scene_idx + 1):
+            if i in filtered_remove:
+                filtered_remove.remove(i)
+    for group in remove_groups:
+        if len(group) <= MAX_REMOVE_GROUP_SIZE:
+            filtered_remove.extend(group)
+    print(f"🧩 Filtered scenes to remove (after capping long chunks): {filtered_remove}")
+    # Final list of segments to keep
+    keep = [t for i,t in enumerate(times) if i not in filtered_remove]
+    # Create a temporary directory for segments
+    os.makedirs("temp_segments", exist_ok=True)
+    try:
+        parts = []
+        for i,(ss,tt) in enumerate(keep):
+            fn = os.path.join("temp_segments", f"segment_{i}.mp4")
+            # Use proper encoding settings to maintain frame integrity
+            run_ffmpeg([
+                "-i", video_path,
+                "-ss", str(ss),
+                "-to", str(tt),
+                "-c:v", "libx264",  # Use H.264 codec
+                "-preset", "medium",  # Balance between speed and quality
+                "-crf", "23",  # Constant Rate Factor for quality
+                "-c:a", "aac",  # Audio codec
+                "-b:a", "128k",  # Audio bitrate
+                "-movflags", "+faststart",  # Enable fast start for web playback
+                fn
+            ])
+            parts.append(fn)
+        # Create concat file
+        with open("parts.txt", "w") as f:
+            for p in parts:
+                f.write(f"file '{p}'\n")
+        # Concatenate segments with proper encoding
+        run_ffmpeg([
+            "-f", "concat",
+            "-safe", "0",
+            "-i", "parts.txt",
+            "-c:v", "libx264",
+            "-preset", "medium",
+            "-crf", "23",
+            "-c:a", "aac",
+            "-b:a", "128k",
+            "-movflags", "+faststart",
+            out
+        ])
+    finally:
+        # Cleanup
+        for p in parts:
+            if os.path.exists(p):
+                os.remove(p)
+        if os.path.exists("parts.txt"):
+            os.remove("parts.txt")
+        if os.path.exists("temp_segments"):
+            os.rmdir("temp_segments")
+# ─── 7. MAIN PIPELINE ──────────────────────────────────────────────────────────
+def run(video, query):
+    print(f"\n🎥 Video: {video}\n🔎 Query: '{query}'\n")
+    scenes    = detect_scenes(video)
+    print(f"🔢 {len(scenes)} scenes detected.")
+    keyframes = extract_keyframes(video, scenes)
+    print(f"🖼️ {len(keyframes)} keyframes extracted.\n")
+    captions = [generate_scene_caption(f) for _, f in tqdm(keyframes, desc="Generating captions")]
+    summary  = generate_video_summary(captions)
+    print("\n--- Video Summary ---")
+    print(summary)
+    # 🧠 Let the LLM decide which scenes to remove based on captions
+    to_remove = filter_scenes_with_llm(captions, query, groq_llm)
+    print(f"\n🔴 Scenes to remove: {to_remove}")
+    if to_remove:
+        remove_scenes(video, scenes, to_remove)
+        print("✅ Trimmed video saved as `trimmed.mp4`.")
+    else:
+        print("⚠️ No matching scenes found; no trimming done.")
+    return to_remove  # Optional: return for external use
+# ─── 8. ENTRY POINT ─────────────────────────────────────────────────────────────
+# Command-line entry: expects a video path and a query string as the first two
+# arguments; prints usage and exits with status 1 when either is missing.
+if __name__ == "__main__":
+    if len(sys.argv)<3:
+        print("Usage: python main.py <video.mp4> \"your query here\"")
+        sys.exit(1)
+    run(sys.argv[1], sys.argv[2])

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+# Core dependencies
+gradio>=4.19.2
+opencv-python>=4.9.0.80
+python-dotenv>=1.0.0
+Pillow>=10.2.0
+spaces>=0.1.0
+# Video processing
+scenedetect>=0.6.3
+ffmpeg-python>=0.2.0
+# AI/ML models
+transformers>=4.37.2
+sentence-transformers>=2.5.1
+torch>=2.2.0
+# LLM and embeddings
+langchain-groq>=0.0.1
+langchain-core>=0.1.27
+# Utilities
+tqdm>=4.66.1
+numpy>=1.26.3