prithivMLmods committed
Commit 016d36e · verified · 1 Parent(s): a074d01

Update app.py

Files changed (1):
  1. app.py +5 -57
app.py CHANGED
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
 os.system('pip install backoff')
-
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -324,14 +323,6 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# ------------------------------------------------------------------------------
-# New Gemma3-4b Multimodal Feature (Image & Text)
-# ------------------------------------------------------------------------------
-from transformers import AutoProcessor as Gemma3AutoProcessor, Gemma3ForConditionalGeneration
-gemma3_model_id = "google/gemma-3-4b-it"
-gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(gemma3_model_id, device_map="auto").eval()
-gemma3_processor = Gemma3AutoProcessor.from_pretrained(gemma3_model_id)
-
 # Asynchronous text-to-speech
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -473,7 +464,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, @phi4, and now @gemma3-4b commands
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
@@ -493,8 +484,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
-    - **"@gemma3-4b": triggers multimodal (image/text) processing using the Gemma3-4b model.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -654,48 +644,6 @@ def generate(
             yield buffer
         return
 
-    # --- Gemma3-4b Multimodal branch (Image/Text) with Streaming ---
-    if text.strip().lower().startswith("@gemma3-4b"):
-        question = text[len("@gemma3-4b"):].strip()
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}]
-            },
-            {
-                "role": "user",
-                "content": []
-            }
-        ]
-        if files:
-            try:
-                # If file is already a PIL Image, use it; otherwise try opening it.
-                if isinstance(files[0], Image.Image):
-                    image = files[0]
-                else:
-                    image = Image.open(files[0])
-                messages[1]["content"].append({"type": "image", "image": image})
-            except Exception as e:
-                yield f"Error processing image: {str(e)}"
-                return
-        messages[1]["content"].append({"type": "text", "text": question})
-        inputs = gemma3_processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True,
-            return_dict=True, return_tensors="pt"
-        ).to(gemma3_model.device, dtype=torch.bfloat16)
-        input_len = inputs["input_ids"].shape[-1]
-        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False}
-        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing Gemma3-4b Multimodal")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -785,10 +733,10 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@gemma3-4b Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "@gemma3-4b Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         [{"text": "@phi4 Summarize the content", "files": ["examples/write.jpg"]}],
+        [{"text": "Explain the Image", "files": ["examples/3.jpg"]}],
+        [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         ["@image A drawing of an man made out of hamburger, blue sky background, soft pastel colors"],
@@ -809,7 +757,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @gemma3-4b, @rAgent, @web, @yolo, default [plain text]"
+        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
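
For reference, the branch removed by this commit can be run outside of app.py. The following is a minimal sketch under stated assumptions: a transformers release that ships Gemma3ForConditionalGeneration, a CUDA GPU with bfloat16 support, and access to the gated google/gemma-3-4b-it checkpoint. The stream_gemma3 helper name is hypothetical (app.py inlined this logic in generate()); the Gradio progress bar and time.sleep pacing are omitted; the streamer is given the underlying tokenizer rather than the processor object the removed code passed; and the model is loaded in bfloat16 so the bfloat16 inputs match the weights.

# Minimal, self-contained sketch of the removed "@gemma3-4b" branch.
# Assumptions: transformers >= 4.50 (ships Gemma3ForConditionalGeneration),
# a CUDA GPU with bfloat16 support, and access to google/gemma-3-4b-it.
from threading import Thread

import torch
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

model_id = "google/gemma-3-4b-it"
# Load in bfloat16 so the bfloat16-cast inputs below match the weight dtype
# (the removed code loaded the model in the default dtype).
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
).eval()
processor = AutoProcessor.from_pretrained(model_id)


def stream_gemma3(question, image_path=None, max_new_tokens=512):
    """Yield the growing answer string, mirroring the streaming loop removed from app.py."""
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": []},
    ]
    if image_path is not None:
        messages[1]["content"].append({"type": "image", "image": Image.open(image_path)})
    messages[1]["content"].append({"type": "text", "text": question})

    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_dict=True, return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)

    # Stream tokens from a background generation thread, as the original branch did.
    # The streamer only needs the tokenizer to decode new tokens.
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False},
    ).start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer


if __name__ == "__main__":
    # Example paths mirror the examples/ files referenced in app.py.
    answer = ""
    for answer in stream_gemma3("Explain the Image", image_path="examples/3.jpg"):
        pass
    print(answer)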