Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@ import torch
 import numpy as np
 from PIL import Image
 import edge_tts
+import cv2  # New import for video processing
 
 from transformers import (
     AutoModelForCausalLM,
@@ -113,7 +114,6 @@ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 
 dtype = torch.float16 if device.type == "cuda" else torch.float32
 
-
 # STABLE DIFFUSION IMAGE GENERATION MODELS
 
 if torch.cuda.is_available():
@@ -201,7 +201,6 @@ def save_image(img: Image.Image) -> str:
     img.save(unique_name)
     return unique_name
 
-
 # GEMMA3-4B MULTIMODAL MODEL
 
 gemma3_model_id = "google/gemma-3-4b-it"
@@ -210,6 +209,25 @@ gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
 ).eval()
 gemma3_processor = AutoProcessor.from_pretrained(gemma3_model_id)
 
+# VIDEO PROCESSING HELPER
+def downsample_video(video_path):
+    vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frames = []
+    # Sample 10 evenly spaced frames.
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+    for i in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            # Convert from BGR to RGB and then to PIL Image.
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
+    vidcap.release()
+    return frames
 
 # MAIN GENERATION FUNCTION
 
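Note on the helper above: cv2 comes from the OpenCV package (opencv-python / opencv-python-headless), so it has to be available in the Space for the new import to resolve. The function returns up to ten (PIL.Image, timestamp) pairs, with the timestamp in seconds rounded to two decimals. A minimal sketch of how it can be exercised, assuming the helper is in scope and that one of the example clips referenced further down (examples/oreo.mp4) is present:

# Quick check of the sampling helper (downsample_video as defined above must be
# in scope; examples/oreo.mp4 is one of the clips listed in the examples below).
frames = downsample_video("examples/oreo.mp4")
print(f"sampled {len(frames)} frames")            # at most 10
for pil_image, timestamp in frames:
    print(pil_image.size, f"{timestamp}s")        # (width, height), seconds into the clip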
@@ -228,7 +246,7 @@ def generate(
 
     lower_text = text.lower().strip()
 
-    #
+    # IMAGE GENERATION BRANCH (Stable Diffusion models)
     if (lower_text.startswith("@lightningv5") or
         lower_text.startswith("@lightningv4") or
         lower_text.startswith("@turbov3")):
@@ -280,20 +298,76 @@ def generate(
         yield gr.Image(image_path)
         return
 
-    # GEMMA3-4B
+    # GEMMA3-4B TEXT & MULTIMODAL (image) Branch
     if lower_text.startswith("@gemma3-4b"):
-        #
-
-
-
-
-
-
-
-
-
+        # If it is video, let the dedicated branch handle it.
+        if lower_text.startswith("@gemma3-4b-video"):
+            pass  # video branch is handled below.
+        else:
+            # Remove the gemma3 flag from the prompt.
+            prompt_clean = re.sub(r"@gemma3-4b", "", text, flags=re.IGNORECASE).strip().strip('"')
+            if files:
+                # If image files are provided, load them.
+                images = [load_image(f) for f in files]
+                messages = [{
+                    "role": "user",
+                    "content": [
+                        *[{"type": "image", "image": image} for image in images],
+                        {"type": "text", "text": prompt_clean},
+                    ]
+                }]
+            else:
+                messages = [
+                    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                    {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
                 ]
-
+            inputs = gemma3_processor.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True,
+                return_dict=True, return_tensors="pt"
+            ).to(gemma3_model.device, dtype=torch.bfloat16)
+            streamer = TextIteratorStreamer(
+                gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+            )
+            generation_kwargs = {
+                **inputs,
+                "streamer": streamer,
+                "max_new_tokens": max_new_tokens,
+                "do_sample": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k,
+                "repetition_penalty": repetition_penalty,
+            }
+            thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            yield progress_bar_html("Processing with Gemma3-4b")
+            for new_text in streamer:
+                buffer += new_text
+                time.sleep(0.01)
+                yield buffer
+            return
+
+    # NEW: GEMMA3-4B VIDEO Branch
+    if lower_text.startswith("@gemma3-4b-video"):
+        # Remove the video flag from the prompt.
+        prompt_clean = re.sub(r"@gemma3-4b-video", "", text, flags=re.IGNORECASE).strip().strip('"')
+        if files:
+            # Assume the first file is a video.
+            video_path = files[0]
+            frames = downsample_video(video_path)
+            messages = [
+                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
+            ]
+            # Append each frame as an image with a timestamp label.
+            for frame in frames:
+                image, timestamp = frame
+                # Save the frame image to a temporary unique filename.
+                image_path = f"video_frame_{uuid.uuid4().hex}.png"
+                image.save(image_path)
+                messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+                messages[1]["content"].append({"type": "image", "url": image_path})
         else:
             messages = [
                 {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
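Two things worth flagging in the hunk above. First, the routing: a prompt tagged @gemma3-4b-video also matches the @gemma3-4b prefix, so the text/image branch checks for the video tag and deliberately passes, letting the dedicated branch below pick it up. Second, the video branch interleaves a timestamp label and a frame image per sampled frame into the user turn before calling apply_chat_template; a sketch of the resulting structure for a clip with two sampled frames (prompt text and file names are illustrative, the real frame files use uuid4 hex names):

# Illustrative shape of `messages` as built by the @gemma3-4b-video branch.
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "text", "text": "Summarize the events in this video"},
        {"type": "text", "text": "Frame 0.0:"},
        {"type": "image", "url": "video_frame_0a1b2c.png"},
        {"type": "text", "text": "Frame 4.27:"},
        {"type": "image", "url": "video_frame_3d4e5f.png"},
    ]},
]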
@@ -319,7 +393,7 @@ def generate(
         thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing with Gemma3-4b")
+        yield progress_bar_html("Processing with Gemma3-4b Video")
         for new_text in streamer:
             buffer += new_text
             time.sleep(0.01)
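Both Gemma branches above strip their tag with the same re.sub(...).strip().strip('"') pattern before building the prompt. A worked example of what that yields for a quoted video query (the input string is illustrative):

import re

text = '@gemma3-4b-video "Summarize the events in this video"'
prompt_clean = re.sub(r"@gemma3-4b-video", "", text, flags=re.IGNORECASE).strip().strip('"')
print(prompt_clean)   # Summarize the events in this video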
@@ -408,7 +482,9 @@ demo = gr.ChatInterface(
     ],
     examples=[
         [{"text": "@gemma3-4b Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "@gemma3-4b
+        [{"text": "@gemma3-4b-video Explain what is happening in this video ?", "files": ["examples/oreo.mp4"]}],
+        [{"text": "@gemma3-4b-video Summarize the events in this video", "files": ["examples/sky.mp4"]}],
+        [{"text": "@gemma3-4b-video What is in the video ?", "files": ["examples/redlight.mp4"]}],
         [{"text": "@gemma3-4b Where do the major drought happen?", "files": ["examples/111.png"]}],
         [{"text": "@gemma3-4b Transcription of the letter", "files": ["examples/222.png"]}],
         ['@lightningv5 Chocolate dripping from a donut'],
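The three new example rows reference examples/oreo.mp4, examples/sky.mp4, and examples/redlight.mp4; those clips need to be committed to the Space's examples/ folder alongside the existing images for the rows to work. An optional sanity check using only the paths listed above:

# Optional check that the new example clips are present in the repo.
from pathlib import Path

for clip in ["examples/oreo.mp4", "examples/sky.mp4", "examples/redlight.mp4"]:
    assert Path(clip).exists(), f"missing example file: {clip}"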
@@ -420,9 +496,9 @@ demo = gr.ChatInterface(
     ],
     cache_examples=False,
     type="messages",
-    description="# **Imagineo Chat `@gemma3-4b 'prompt..', @lightningv5, etc..`**",
+    description="# **Imagineo Chat `@gemma3-4b 'prompt..', @gemma3-4b-video, @lightningv5, etc..`**",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="use the tags
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="use the tags @gemma3-4b for multimodal, @gemma3-4b-video for video, @lightningv5, @lightningv4, @turbov3 for image gen !"),
     stop_btn="Stop Generation",
     multimodal=True,
 )