Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
 os.system('pip install backoff')
-
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -324,14 +323,6 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# ------------------------------------------------------------------------------
-# New Gemma3-4b Multimodal Feature (Image & Text)
-# ------------------------------------------------------------------------------
-from transformers import AutoProcessor as Gemma3AutoProcessor, Gemma3ForConditionalGeneration
-gemma3_model_id = "google/gemma-3-4b-it"
-gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(gemma3_model_id, device_map="auto").eval()
-gemma3_processor = Gemma3AutoProcessor.from_pretrained(gemma3_model_id)
-
 # Asynchronous text-to-speech
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -473,7 +464,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo,
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
@@ -493,8 +484,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model
-    - **"@gemma3-4b": triggers multimodal (image/text) processing using the Gemma3-4b model.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
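The commands documented above are dispatched by simple prefix matching on the user text. A minimal sketch of that dispatch pattern, with hypothetical handler names (the real branches live inside generate() itself):

# Sketch of the prefix-dispatch idiom used by generate(); the handler
# names here are illustrative assumptions, not the app's actual functions.
def dispatch(text: str) -> str:
    lowered = text.strip().lower()
    commands = {
        "@web": "web_search",
        "@ragent": "reasoning_chain",
        "@yolo": "yolo_detection",
        "@phi4": "phi4_multimodal",
    }
    for prefix, handler in commands.items():
        if lowered.startswith(prefix):
            return handler
    return "default_text"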
@@ -654,48 +644,6 @@ def generate(
         yield buffer
         return
 
-    # --- Gemma3-4b Multimodal branch (Image/Text) with Streaming ---
-    if text.strip().lower().startswith("@gemma3-4b"):
-        question = text[len("@gemma3-4b"):].strip()
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}]
-            },
-            {
-                "role": "user",
-                "content": []
-            }
-        ]
-        if files:
-            try:
-                # If file is already a PIL Image, use it; otherwise try opening it.
-                if isinstance(files[0], Image.Image):
-                    image = files[0]
-                else:
-                    image = Image.open(files[0])
-                messages[1]["content"].append({"type": "image", "image": image})
-            except Exception as e:
-                yield f"Error processing image: {str(e)}"
-                return
-        messages[1]["content"].append({"type": "text", "text": question})
-        inputs = gemma3_processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True,
-            return_dict=True, return_tensors="pt"
-        ).to(gemma3_model.device, dtype=torch.bfloat16)
-        input_len = inputs["input_ids"].shape[-1]
-        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False}
-        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing Gemma3-4b Multimodal")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
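The removed branch uses the standard transformers streaming idiom: generate() blocks, so it runs in a worker thread while a TextIteratorStreamer yields decoded text incrementally to the caller. A self-contained sketch of that pattern, assuming a small illustrative model rather than Gemma3:

# Minimal streaming-generation sketch; model id and prompt are
# illustrative assumptions, not part of this commit.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "gpt2"  # assumed small model, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Streaming generation works by", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() runs in a background thread; the main thread drains the
# streamer iterator as tokens arrive.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 40},
)
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()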
@@ -785,10 +733,10 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@gemma3-4b Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "@gemma3-4b Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         [{"text": "@phi4 Summarize the content", "files": ["examples/write.jpg"]}],
+        [{"text": "Explain the Image", "files": ["examples/3.jpg"]}],
+        [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         ["@image A drawing of an man made out of hamburger, blue sky background, soft pastel colors"],
@@ -809,7 +757,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @
+        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
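For context, the MultimodalTextbox edited in the last hunk plugs into gr.ChatInterface roughly as follows. This is a minimal sketch assuming Gradio 4.x, with a stub fn standing in for the app's generate():

# Minimal gr.ChatInterface wiring sketch; everything beyond the keyword
# arguments shown in the diff above is an illustrative assumption.
import gradio as gr

def generate(input_dict, history):
    # With multimodal=True, fn receives {"text": ..., "files": [...]}.
    yield f"echo: {input_dict['text']}"

demo = gr.ChatInterface(
    fn=generate,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "audio"],
        file_count="multiple",
        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], ...",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
)

if __name__ == "__main__":
    demo.launch()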