Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
 os.system('pip install backoff')
-
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -324,14 +323,6 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# ------------------------------------------------------------------------------
-# New Gemma3-4b Multimodal Feature (Image & Text)
-# ------------------------------------------------------------------------------
-from transformers import AutoProcessor as Gemma3AutoProcessor, Gemma3ForConditionalGeneration
-gemma3_model_id = "google/gemma-3-4b-it"
-gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(gemma3_model_id, device_map="auto").eval()
-gemma3_processor = Gemma3AutoProcessor.from_pretrained(gemma3_model_id)
-
 # Asynchronous text-to-speech
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -473,7 +464,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo,
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
@@ -493,8 +484,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model
-    - **"@gemma3-4b": triggers multimodal (image/text) processing using the Gemma3-4b model.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
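The docstring above is the only inventory of the chat commands; inside generate() each command is picked up by a lowercase prefix check on the incoming message, exactly as the @gemma3-4b branch removed in the next hunk does. A rough sketch of that routing with the branch bodies elided (the prefix list mirrors the docstring; the route helper is illustrative, not a function that exists in app.py):

# Illustrative only: app.py inlines each branch rather than using a helper like this.
PREFIXES = ("@tts1", "@tts2", "@image", "@3d", "@web", "@ragent", "@yolo", "@phi4")

def route(text: str) -> str:
    lowered = text.strip().lower()
    for prefix in PREFIXES:
        if lowered.startswith(prefix):
            return prefix   # dispatch to the matching branch of generate()
    return "default"        # plain text falls through to the default LLM branch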
@@ -654,48 +644,6 @@ def generate(
         yield buffer
         return
 
-    # --- Gemma3-4b Multimodal branch (Image/Text) with Streaming ---
-    if text.strip().lower().startswith("@gemma3-4b"):
-        question = text[len("@gemma3-4b"):].strip()
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}]
-            },
-            {
-                "role": "user",
-                "content": []
-            }
-        ]
-        if files:
-            try:
-                # If file is already a PIL Image, use it; otherwise try opening it.
-                if isinstance(files[0], Image.Image):
-                    image = files[0]
-                else:
-                    image = Image.open(files[0])
-                messages[1]["content"].append({"type": "image", "image": image})
-            except Exception as e:
-                yield f"Error processing image: {str(e)}"
-                return
-        messages[1]["content"].append({"type": "text", "text": question})
-        inputs = gemma3_processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True,
-            return_dict=True, return_tensors="pt"
-        ).to(gemma3_model.device, dtype=torch.bfloat16)
-        input_len = inputs["input_ids"].shape[-1]
-        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False}
-        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing Gemma3-4b Multimodal")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
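The branch removed above follows the same threaded streaming pattern the Space's other model branches use: build a chat-template prompt with the processor, launch model.generate on a background thread, and yield the growing buffer from a TextIteratorStreamer. A condensed, self-contained sketch of that pattern (assumes a transformers release with Gemma 3 support and access to the google/gemma-3-4b-it weights; loading in bfloat16 is an assumption here, the removed code used the default dtype):

from threading import Thread
import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

model_id = "google/gemma-3-4b-it"
processor = AutoProcessor.from_pretrained(model_id)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
).eval()

def stream_reply(question, image=None, max_new_tokens=512):
    # Assemble the same system/user message layout the removed branch built.
    content = [{"type": "image", "image": image}] if image is not None else []
    content.append({"type": "text", "text": question})
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": content},
    ]
    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_dict=True, return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)
    # Decode streamed token ids with the underlying tokenizer while generate() runs on a thread.
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs={**inputs, "streamer": streamer,
                   "max_new_tokens": max_new_tokens, "do_sample": False}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer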
@@ -785,10 +733,10 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@gemma3-4b Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "@gemma3-4b Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         [{"text": "@phi4 Summarize the content", "files": ["examples/write.jpg"]}],
+        [{"text": "Explain the Image", "files": ["examples/3.jpg"]}],
+        [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         ["@image A drawing of an man made out of hamburger, blue sky background, soft pastel colors"],
@@ -809,7 +757,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @
+        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
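The last hunk only rewrites the hint text shown inside the multimodal textbox. For context, this is roughly how such a ChatInterface is wired up; a stripped-down sketch, not the Space's actual configuration (the real generate() streams model output and takes several extra slider inputs):

import gradio as gr

def generate(input_dict, chat_history):
    # The multimodal textbox delivers {"text": ..., "files": [...]} as the message.
    text = input_dict["text"]
    files = input_dict.get("files", [])
    yield f"echo: {text} ({len(files)} file(s) attached)"

demo = gr.ChatInterface(
    fn=generate,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "audio"],
        file_count="multiple",
        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
)

if __name__ == "__main__":
    demo.launch()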