prithivMLmods committed · verified
Commit 55a7e0e · 1 Parent(s): ed406dc

Update app.py

Files changed (1):
  1. app.py +110 -118

app.py CHANGED
@@ -15,12 +15,10 @@ from PIL import Image
 import cv2
 
 from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
-    Qwen2VLForConditionalGeneration,
     AutoProcessor,
     Gemma3ForConditionalGeneration,
+    Qwen2VLForConditionalGeneration,
+    TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 
@@ -38,7 +36,7 @@ def progress_bar_html(label: str) -> str:
     <div style="display: flex; align-items: center;">
         <span style="margin-right: 10px; font-size: 14px;">{label}</span>
         <div style="width: 110px; height: 5px; background-color: #F0FFF0; border-radius: 2px; overflow: hidden;">
-            <div style="width: 100%; height: 100%; background-color: #00FF00 ; animation: loading 1.5s linear infinite;"></div>
+            <div style="width: 100%; height: 100%; background-color: #00FF00; animation: loading 1.5s linear infinite;"></div>
         </div>
     </div>
     <style>
@@ -49,18 +47,7 @@ def progress_bar_html(label: str) -> str:
     </style>
     '''
 
-# TEXT MODEL
-
-model_id = "prithivMLmods/FastThink-0.5B-Tiny"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-
-# MULTIMODAL (OCR) MODELS
+# Qwen2-VL (for optional image inference)
 
 MODEL_ID_VL = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_VL, trust_remote_code=True)
@@ -102,7 +89,8 @@ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 
 dtype = torch.float16 if device.type == "cuda" else torch.float32
 
-# GEMMA3-4B MULTIMODAL MODEL
+
+# Gemma3 Model (default for text, image, & video inference)
 
 gemma3_model_id = "google/gemma-3-4b-it" # alternative: google/gemma-3-12b-it
 gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
@@ -111,6 +99,7 @@ gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
 gemma3_processor = AutoProcessor.from_pretrained(gemma3_model_id)
 
 # VIDEO PROCESSING HELPER
+
 def downsample_video(video_path):
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -144,15 +133,12 @@ def generate(
 ):
     text = input_dict["text"]
     files = input_dict.get("files", [])
-
     lower_text = text.lower().strip()
 
-    # GEMMA3-4B TEXT & MULTIMODAL (image) Branch
-    if lower_text.startswith("@gemma3"):
-        # Remove the gemma3 flag from the prompt.
-        prompt_clean = re.sub(r"@gemma3", "", text, flags=re.IGNORECASE).strip().strip('"')
+    # ----- Qwen2-VL branch (triggered with @qwen2-vl) -----
+    if lower_text.startswith("@qwen2-vl"):
+        prompt_clean = re.sub(r"@qwen2-vl", "", text, flags=re.IGNORECASE).strip().strip('"')
         if files:
-            # If image files are provided, load them.
             images = [load_image(f) for f in files]
             messages = [{
                 "role": "user",
@@ -161,18 +147,18 @@ def generate(
                     {"type": "text", "text": prompt_clean},
                 ]
             }]
+            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
         else:
             messages = [
                 {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
                 {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
            ]
-        inputs = gemma3_processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True,
-            return_dict=True, return_tensors="pt"
-        ).to(gemma3_model.device, dtype=torch.bfloat16)
-        streamer = TextIteratorStreamer(
-            gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-        )
+            inputs = processor.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True,
+                return_dict=True, return_tensors="pt"
+            ).to("cuda", dtype=torch.float16)
+        streamer = TextIteratorStreamer(processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
@@ -183,47 +169,107 @@ def generate(
             "top_p": top_p,
             "top_k": top_k,
             "repetition_penalty": repetition_penalty,
         }
-        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing with Gemma3")
+        yield progress_bar_html("Processing with Qwen2VL")
         for new_text in streamer:
             buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
         return
 
-    # GEMMA3-4B VIDEO Branch
-    if lower_text.startswith("@video-infer"):
-        # Remove the video flag from the prompt.
-        prompt_clean = re.sub(r"@video-infer", "", text, flags=re.IGNORECASE).strip().strip('"')
-        if files:
-            # Assume the first file is a video.
+    # ----- Default branch: Gemma3 (for text, image, & video inference) -----
+    if files:
+        # Check if any provided file is a video based on extension.
+        video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")
+        if any(str(f).lower().endswith(video_extensions) for f in files):
+            # Video inference branch.
+            prompt_clean = re.sub(r"@video-infer", "", text, flags=re.IGNORECASE).strip().strip('"')
             video_path = files[0]
             frames = downsample_video(video_path)
             messages = [
                 {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
                 {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
             ]
-            # Append each frame as an image with a timestamp label.
+            # Append each frame (with its timestamp) to the conversation.
             for frame in frames:
                 image, timestamp = frame
                 image_path = f"video_frame_{uuid.uuid4().hex}.png"
                 image.save(image_path)
                 messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
                 messages[1]["content"].append({"type": "image", "url": image_path})
+            inputs = gemma3_processor.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=True,
+                return_dict=True, return_tensors="pt"
+            ).to(gemma3_model.device, dtype=torch.bfloat16)
+            streamer = TextIteratorStreamer(gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+            generation_kwargs = {
+                **inputs,
+                "streamer": streamer,
+                "max_new_tokens": max_new_tokens,
+                "do_sample": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k,
+                "repetition_penalty": repetition_penalty,
+            }
+            thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            yield progress_bar_html("Processing video with Gemma3")
+            for new_text in streamer:
+                buffer += new_text
+                time.sleep(0.01)
+                yield buffer
+            return
         else:
-            messages = [
-                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-                {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
-            ]
+            # Image inference branch.
+            prompt_clean = re.sub(r"@gemma3", "", text, flags=re.IGNORECASE).strip().strip('"')
+            images = [load_image(f) for f in files]
+            messages = [{
+                "role": "user",
+                "content": [
+                    *[{"type": "image", "image": image} for image in images],
+                    {"type": "text", "text": prompt_clean},
+                ]
+            }]
+            inputs = gemma3_processor.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True,
+                return_dict=True, return_tensors="pt"
+            ).to(gemma3_model.device, dtype=torch.bfloat16)
+            streamer = TextIteratorStreamer(gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+            generation_kwargs = {
+                **inputs,
+                "streamer": streamer,
+                "max_new_tokens": max_new_tokens,
+                "do_sample": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k,
+                "repetition_penalty": repetition_penalty,
+            }
+            thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+            thread.start()
+            buffer = ""
+            yield progress_bar_html("Processing with Gemma3")
+            for new_text in streamer:
+                buffer += new_text
+                time.sleep(0.01)
+                yield buffer
+            return
+    else:
+        # Text-only inference branch.
+        messages = [
+            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+            {"role": "user", "content": [{"type": "text", "text": text}]}
+        ]
         inputs = gemma3_processor.apply_chat_template(
             messages, add_generation_prompt=True, tokenize=True,
             return_dict=True, return_tensors="pt"
         ).to(gemma3_model.device, dtype=torch.bfloat16)
-        streamer = TextIteratorStreamer(
-            gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-        )
+        streamer = TextIteratorStreamer(gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
@@ -236,70 +282,16 @@ def generate(
         }
         thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
         thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing video with Gemma3")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-
-    # Otherwise, handle text/chat generation.
-    conversation = clean_chat_history(chat_history)
-    conversation.append({"role": "user", "content": text})
-
-    if files:
-        images = [load_image(image) for image in files] if len(files) > 1 else [load_image(files[0])]
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        buffer = ""
-        yield progress_bar_html("Processing with Qwen2VL OCR")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    else:
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-
     outputs = []
     for new_text in streamer:
         outputs.append(new_text)
         yield "".join(outputs)
-
     final_response = "".join(outputs)
     yield final_response
 
+
+# Gradio Interface
+
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -312,7 +304,7 @@ demo = gr.ChatInterface(
     examples=[
        [
            {
-                "text": "@gemma3 Create a short story based on the images.",
+                "text": "Create a short story based on the images.",
                "files": [
                    "examples/1111.jpg",
                    "examples/2222.jpg",
@@ -320,24 +312,24 @@ demo = gr.ChatInterface(
                ],
            }
        ],
-        [{"text": "@gemma3 Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "@video-infer Explain the content of the Advertisement", "files": ["examples/videoplayback.mp4"]}],
-        [{"text": "@gemma3 Which movie character is this?", "files": ["examples/9999.jpg"]}],
-        ["@gemma3 Explain Critical Temperature of Substance"],
-        [{"text": "@gemma3 Transcription of the letter", "files": ["examples/222.png"]}],
-        [{"text": "@video-infer Explain the content of the video in detail", "files": ["examples/breakfast.mp4"]}],
-        [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
-        [{"text": "@video-infer Explain what is happening in this video ?", "files": ["examples/oreo.mp4"]}],
-        [{"text": "@video-infer Summarize the events in this video", "files": ["examples/sky.mp4"]}],
-        [{"text": "@video-infer What is in the video ?", "files": ["examples/redlight.mp4"]}],
+        [{"text": "Explain the Image", "files": ["examples/3.jpg"]}],
+        [{"text": "Explain the content of the Advertisement", "files": ["examples/videoplayback.mp4"]}],
+        [{"text": "Which movie character is this?", "files": ["examples/9999.jpg"]}],
+        ["Explain Critical Temperature of Substance"],
+        [{"text": "@qwen2-vl Transcription of the letter", "files": ["examples/222.png"]}],
+        [{"text": "Explain the content of the video in detail", "files": ["examples/breakfast.mp4"]}],
+        [{"text": "Describe the video", "files": ["examples/Missing.mp4"]}],
+        [{"text": "Explain what is happening in this video ?", "files": ["examples/oreo.mp4"]}],
+        [{"text": "Summarize the events in this video", "files": ["examples/sky.mp4"]}],
+        [{"text": "What is in the video ?", "files": ["examples/redlight.mp4"]}],
        ["Python Program for Array Rotation"],
-        ["@gemma3 Explain Critical Temperature of Substance"]
+        ["Explain Critical Temperature of Substance"]
    ],
    cache_examples=False,
    type="messages",
-    description="# **Gemma 3 `@gemma3, @video-infer for video understanding`**",
+    description="# **Gemma 3 Multimodal** \n`Use @qwen2-vl to switch to Qwen2-VL OCR for image inference and @video-infer for video input`",
    fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="Tag--> @gemma3 for multimodal, @video-infer for video !"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="Tag with @qwen2-vl for Qwen2-VL inference if needed."),
    stop_btn="Stop Generation",
    multimodal=True,
 )
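
For reference, a minimal sketch of the routing behaviour this commit introduces in generate(): the @gemma3 and @video-infer tags are no longer required, Gemma3 is the default for text, image, and video input (videos are detected by file extension), and @qwen2-vl switches image inference to the Qwen2-VL OCR model. The route() helper below is illustrative only and not part of app.py.

import re

# Illustrative sketch only: mirrors the dispatch order generate() uses after this commit.
VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi", ".mkv", ".webm")

def route(text, files):
    """Return (branch, cleaned_prompt) the way generate() would pick a model path."""
    lower_text = text.lower().strip()
    if lower_text.startswith("@qwen2-vl"):
        # Optional Qwen2-VL OCR branch, selected explicitly by the @qwen2-vl tag.
        return "qwen2-vl", re.sub(r"@qwen2-vl", "", text, flags=re.IGNORECASE).strip().strip('"')
    if files:
        if any(str(f).lower().endswith(VIDEO_EXTENSIONS) for f in files):
            # Videos are detected by extension; @video-infer is only stripped if present.
            return "gemma3-video", re.sub(r"@video-infer", "", text, flags=re.IGNORECASE).strip().strip('"')
        return "gemma3-image", re.sub(r"@gemma3", "", text, flags=re.IGNORECASE).strip().strip('"')
    return "gemma3-text", text

print(route("Describe the video", ["examples/sky.mp4"]))                     # ('gemma3-video', 'Describe the video')
print(route("@qwen2-vl Transcription of the letter", ["examples/222.png"]))  # ('qwen2-vl', 'Transcription of the letter')
print(route("Explain Critical Temperature of Substance", []))                # ('gemma3-text', 'Explain Critical Temperature of Substance')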