Update app.py
--- a/app.py
+++ b/app.py
@@ -84,6 +84,15 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+# -----------------------
+# GEMMA3-4B MODEL SETUP (NEW FEATURE)
+# -----------------------
+gemma3_model_id = "google/gemma-3-4b-it"
+gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
+    gemma3_model_id, device_map="auto"
+).eval()
+gemma3_processor = AutoProcessor.from_pretrained(gemma3_model_id)
+
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
@@ -209,15 +218,6 @@ def save_image(img: Image.Image) -> str:
     img.save(unique_name)
     return unique_name
 
-# -----------------------
-# GEMMA3-4B MULTIMODAL MODEL
-# -----------------------
-gemma3_model_id = "google/gemma-3-4b-it"
-gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
-    gemma3_model_id, device_map="auto"
-).eval()
-gemma3_processor = AutoProcessor.from_pretrained(gemma3_model_id)
-
 # -----------------------
 # MAIN GENERATION FUNCTION
 # -----------------------
@@ -235,8 +235,8 @@ def generate(
     files = input_dict.get("files", [])
 
     lower_text = text.lower().strip()
-
-    # …
+
+    # 1. IMAGE GENERATION COMMANDS (@lightningv5, @lightningv4, @turbov3)
     if (lower_text.startswith("@lightningv5") or
         lower_text.startswith("@lightningv4") or
         lower_text.startswith("@turbov3")):
@@ -288,53 +288,52 @@ def generate(
         yield gr.Image(image_path)
         return
 
-    # GEMMA3-4B
+    # 2. GEMMA3-4B MULTIMODAL GENERATION (NEW FEATURE)
     if lower_text.startswith("@gemma3-4b"):
-        # Remove the …
+        # Remove the flag from the text prompt.
         prompt_clean = re.sub(r"@gemma3-4b", "", text, flags=re.IGNORECASE).strip().strip('"')
+        # Build messages: include a system message and user message.
+        messages = []
+        messages.append({
+            "role": "system",
+            "content": [{"type": "text", "text": "You are a helpful assistant."}]
+        })
+        user_content = []
         if files:
-            # If …
-            images = [load_image(…
-            …
-            {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
-        ]
+            # If images are uploaded, load them and add them to the message.
+            images = [load_image(image) for image in files]
+            for img in images:
+                user_content.append({"type": "image", "image": img})
+        # Add the text part.
+        user_content.append({"type": "text", "text": prompt_clean})
+        messages.append({
+            "role": "user",
+            "content": user_content
+        })
+
+        # Prepare inputs using Gemma3's processor.
         inputs = gemma3_processor.apply_chat_template(
             messages, add_generation_prompt=True, tokenize=True,
             return_dict=True, return_tensors="pt"
         ).to(gemma3_model.device, dtype=torch.bfloat16)
-        …
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-        }
+
+        input_len = inputs["input_ids"].shape[-1]
+        # Create a text streamer for incremental generation.
+        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
         thread.start()
+
         buffer = ""
-        yield progress_bar_html("Processing with Gemma3-…
+        yield progress_bar_html("Processing with Gemma3-4B")
         for new_text in streamer:
            buffer += new_text
-            time.sleep(0.01)
             yield buffer
+        final_response = buffer
+        yield final_response
         return
 
-    # …
+    # 3. TEXT & TTS GENERATION
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
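The heart of the new @gemma3-4b branch is the standard transformers streaming pattern: model.generate() blocks, so the commit runs it on a worker Thread while a TextIteratorStreamer hands decoded text back to the Gradio generator as tokens arrive. Below is a minimal, standalone sketch of that same pattern using the Gemma 3 APIs the commit relies on; the prompt text and the fixed max_new_tokens value are illustrative, not taken from the app.

# Minimal standalone sketch of the streaming pattern added above.
# Assumes a transformers release with Gemma 3 support; prompt and
# max_new_tokens here are illustrative only.
from threading import Thread

import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, device_map="auto").eval()
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [{"type": "text", "text": "Explain what a text streamer does."}]},
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

# generate() blocks, so it runs on a worker thread; the streamer yields
# decoded text chunks to the main thread as soon as tokens are produced.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 256}).start()

buffer = ""
for new_text in streamer:
    buffer += new_text  # accumulate the partial response
    print(new_text, end="", flush=True)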
84 |
torch_dtype=torch.float16
|
85 |
).to("cuda").eval()
|
86 |
|
87 |
+
# -----------------------
|
88 |
+
# GEMMA3-4B MODEL SETUP (NEW FEATURE)
|
89 |
+
# -----------------------
|
90 |
+
gemma3_model_id = "google/gemma-3-4b-it"
|
91 |
+
gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
|
92 |
+
gemma3_model_id, device_map="auto"
|
93 |
+
).eval()
|
94 |
+
gemma3_processor = AutoProcessor.from_pretrained(gemma3_model_id)
|
95 |
+
|
96 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
97 |
communicate = edge_tts.Communicate(text, voice)
|
98 |
await communicate.save(output_file)
|
|
|
218 |
img.save(unique_name)
|
219 |
return unique_name
|
220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
# -----------------------
|
222 |
# MAIN GENERATION FUNCTION
|
223 |
# -----------------------
|
|
|
235 |
files = input_dict.get("files", [])
|
236 |
|
237 |
lower_text = text.lower().strip()
|
238 |
+
|
239 |
+
# 1. IMAGE GENERATION COMMANDS (@lightningv5, @lightningv4, @turbov3)
|
240 |
if (lower_text.startswith("@lightningv5") or
|
241 |
lower_text.startswith("@lightningv4") or
|
242 |
lower_text.startswith("@turbov3")):
|
|
|
288 |
yield gr.Image(image_path)
|
289 |
return
|
290 |
|
291 |
+
# 2. GEMMA3-4B MULTIMODAL GENERATION (NEW FEATURE)
|
292 |
if lower_text.startswith("@gemma3-4b"):
|
293 |
+
# Remove the flag from the text prompt.
|
294 |
prompt_clean = re.sub(r"@gemma3-4b", "", text, flags=re.IGNORECASE).strip().strip('"')
|
295 |
+
# Build messages: include a system message and user message.
|
296 |
+
messages = []
|
297 |
+
messages.append({
|
298 |
+
"role": "system",
|
299 |
+
"content": [{"type": "text", "text": "You are a helpful assistant."}]
|
300 |
+
})
|
301 |
+
user_content = []
|
302 |
if files:
|
303 |
+
# If images are uploaded, load them and add them to the message.
|
304 |
+
images = [load_image(image) for image in files]
|
305 |
+
for img in images:
|
306 |
+
user_content.append({"type": "image", "image": img})
|
307 |
+
# Add the text part.
|
308 |
+
user_content.append({"type": "text", "text": prompt_clean})
|
309 |
+
messages.append({
|
310 |
+
"role": "user",
|
311 |
+
"content": user_content
|
312 |
+
})
|
313 |
+
|
314 |
+
# Prepare inputs using Gemma3's processor.
|
|
|
|
|
315 |
inputs = gemma3_processor.apply_chat_template(
|
316 |
messages, add_generation_prompt=True, tokenize=True,
|
317 |
return_dict=True, return_tensors="pt"
|
318 |
).to(gemma3_model.device, dtype=torch.bfloat16)
|
319 |
+
|
320 |
+
input_len = inputs["input_ids"].shape[-1]
|
321 |
+
# Create a text streamer for incremental generation.
|
322 |
+
streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
|
323 |
+
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
|
325 |
thread.start()
|
326 |
+
|
327 |
buffer = ""
|
328 |
+
yield progress_bar_html("Processing with Gemma3-4B")
|
329 |
for new_text in streamer:
|
330 |
buffer += new_text
|
|
|
331 |
yield buffer
|
332 |
+
final_response = buffer
|
333 |
+
yield final_response
|
334 |
return
|
335 |
|
336 |
+
# 3. TEXT & TTS GENERATION
|
337 |
tts_prefix = "@tts"
|
338 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
339 |
voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
|
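With this change, generate() routes on message prefixes: @lightningv5, @lightningv4, and @turbov3 trigger image generation; @gemma3-4b (optionally with attached image files) triggers the new Gemma 3 multimodal branch; and @tts1/@tts2 select an edge-tts voice for a spoken reply. For example, sending @gemma3-4b "What is in this picture?" with an uploaded image streams Gemma 3's description into the chat token by token.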