Spaces:

prithivMLmods
/

Gemma-3-Multimodal

Running on Zero

App Files Community

prithivMLmods committed 14 days ago

Commit

4e7ff73

verified ·

1 Parent(s): e7c8feb

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -50

app.py CHANGED Viewed

@@ -114,10 +114,9 @@ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 dtype = torch.float16 if device.type == "cuda" else torch.float32
-# STABLE DIFFUSION IMAGE GENERATION MODELS
 if torch.cuda.is_available():
-    # Lightning 5 model
     pipe = StableDiffusionXLPipeline.from_pretrained(
         "SG161222/RealVisXL_V5.0_Lightning",
         torch_dtype=dtype,
@@ -133,24 +132,6 @@ if torch.cuda.is_available():
     if USE_TORCH_COMPILE:
         pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
         print("Model RealVisXL_V5.0_Lightning Compiled!")
-    # Lightning 4 model
-    pipe2 = StableDiffusionXLPipeline.from_pretrained(
-        "SG161222/RealVisXL_V4.0_Lightning",
-        torch_dtype=dtype,
-        use_safetensors=True,
-        add_watermarker=False,
-    ).to(device)
-    pipe2.text_encoder = pipe2.text_encoder.half()
-    if ENABLE_CPU_OFFLOAD:
-        pipe2.enable_model_cpu_offload()
-    else:
-        pipe2.to(device)
-        print("Loaded RealVisXL_V4.0 on Device!")
-    if USE_TORCH_COMPILE:
-        pipe2.unet = torch.compile(pipe2.unet, mode="reduce-overhead", fullgraph=True)
-        print("Model RealVisXL_V4.0 Compiled!")
 else:
     pipe = StableDiffusionXLPipeline.from_pretrained(
         "SG161222/RealVisXL_V5.0_Lightning",
@@ -158,19 +139,11 @@ else:
         use_safetensors=True,
         add_watermarker=False
     ).to(device)
-    pipe2 = StableDiffusionXLPipeline.from_pretrained(
-        "SG161222/RealVisXL_V4.0_Lightning",
-        torch_dtype=dtype,
-        use_safetensors=True,
-        add_watermarker=False,
-    ).to(device)
-    print("Running on CPU; models loaded in float32.")
 DEFAULT_MODEL = "Lightning 5"
-MODEL_CHOICES = [DEFAULT_MODEL, "Lightning 4"]
 models = {
-    "Lightning 5": pipe,
-    "Lightning 4": pipe2
 }
 def save_image(img: Image.Image) -> str:
@@ -223,21 +196,10 @@ def generate(
     lower_text = text.lower().strip()
-    # IMAGE GENERATION BRANCH (Stable Diffusion models)
-    if (lower_text.startswith("@lightningv5") or
-        lower_text.startswith("@lightningv4")):
-        # Determine model choice based on flag.
-        model_choice = None
-        if "@lightningv5" in lower_text:
-            model_choice = "Lightning 5"
-        elif "@lightningv4" in lower_text:
-            model_choice = "Lightning 4"
         # Remove the model flag from the prompt.
-        prompt_clean = re.sub(r"@lightningv5", "", text, flags=re.IGNORECASE)
-        prompt_clean = re.sub(r"@lightningv4", "", prompt_clean, flags=re.IGNORECASE)
-        prompt_clean = prompt_clean.strip().strip('"')
         # Default parameters for single image generation.
         width = 1024
@@ -264,9 +226,8 @@ def generate(
         if device.type == "cuda":
             torch.cuda.empty_cache()
-        selected_pipe = models.get(model_choice, pipe)
         yield progress_bar_html("Processing Image Generation")
-        images = selected_pipe(**options).images
         image_path = save_image(images[0])
         yield gr.Image(image_path)
         return
@@ -321,7 +282,7 @@ def generate(
                 yield buffer
             return
-    # NEW: GEMMA3-4B VIDEO Branch
     if lower_text.startswith("@video-infer"):
         # Remove the video flag from the prompt.
         prompt_clean = re.sub(r"@video-infer", "", text, flags=re.IGNORECASE).strip().strip('"')
@@ -336,7 +297,6 @@ def generate(
             # Append each frame as an image with a timestamp label.
             for frame in frames:
                 image, timestamp = frame
-                # Save the frame image to a temporary unique filename.
                 image_path = f"video_frame_{uuid.uuid4().hex}.png"
                 image.save(image_path)
                 messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
@@ -465,14 +425,13 @@ demo = gr.ChatInterface(
         ['@lightningv5 Chocolate dripping from a donut'],
         ["Python Program for Array Rotation"],
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
-        ['@lightningv4 Cat holding a sign that says hello world'],
         ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
     description="# **Gemma 3 `@gemma3-4b, @video-infer for video understanding`**",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="@gemma3-4b for multimodal, @video-infer for video, @lightningv5, @lightningv4 for image gen !"),
     stop_btn="Stop Generation",
     multimodal=True,
 )

 dtype = torch.float16 if device.type == "cuda" else torch.float32
+# STABLE DIFFUSION IMAGE GENERATION MODEL (Lightning 5 only)
 if torch.cuda.is_available():
     pipe = StableDiffusionXLPipeline.from_pretrained(
         "SG161222/RealVisXL_V5.0_Lightning",
         torch_dtype=dtype,
     if USE_TORCH_COMPILE:
         pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
         print("Model RealVisXL_V5.0_Lightning Compiled!")
 else:
     pipe = StableDiffusionXLPipeline.from_pretrained(
         "SG161222/RealVisXL_V5.0_Lightning",
         use_safetensors=True,
         add_watermarker=False
     ).to(device)
+    print("Running on CPU; model loaded in float32.")
 DEFAULT_MODEL = "Lightning 5"
 models = {
+    "Lightning 5": pipe
 }
 def save_image(img: Image.Image) -> str:
     lower_text = text.lower().strip()
+    # IMAGE GENERATION BRANCH (Stable Diffusion model using @lightningv5)
+    if lower_text.startswith("@lightningv5"):
         # Remove the model flag from the prompt.
+        prompt_clean = re.sub(r"@lightningv5", "", text, flags=re.IGNORECASE).strip().strip('"')
         # Default parameters for single image generation.
         width = 1024
         if device.type == "cuda":
             torch.cuda.empty_cache()
         yield progress_bar_html("Processing Image Generation")
+        images = models["Lightning 5"](**options).images
         image_path = save_image(images[0])
         yield gr.Image(image_path)
         return
                 yield buffer
             return
+    # GEMMA3-4B VIDEO Branch
     if lower_text.startswith("@video-infer"):
         # Remove the video flag from the prompt.
         prompt_clean = re.sub(r"@video-infer", "", text, flags=re.IGNORECASE).strip().strip('"')
             # Append each frame as an image with a timestamp label.
             for frame in frames:
                 image, timestamp = frame
                 image_path = f"video_frame_{uuid.uuid4().hex}.png"
                 image.save(image_path)
                 messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         ['@lightningv5 Chocolate dripping from a donut'],
         ["Python Program for Array Rotation"],
         ["@tts1 Who is Nikola Tesla, and why did he die?"],
         ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
     description="# **Gemma 3 `@gemma3-4b, @video-infer for video understanding`**",
     fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="@gemma3-4b for multimodal, @video-infer for video, @lightningv5 for image gen !"),
     stop_btn="Stop Generation",
     multimodal=True,
 )