Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -37,7 +37,6 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
 os.system('pip install backoff')
-
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -324,14 +323,6 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# ------------------------------------------------------------------------------
-# New Gemma3-4b Multimodal Feature (Image & Text)
-# ------------------------------------------------------------------------------
-from transformers import AutoProcessor as Gemma3AutoProcessor, Gemma3ForConditionalGeneration
-gemma3_model_id = "google/gemma-3-4b-it"
-gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(gemma3_model_id, device_map="auto").eval()
-gemma3_processor = Gemma3AutoProcessor.from_pretrained(gemma3_model_id)
-
 # Asynchronous text-to-speech
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -473,7 +464,7 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo,
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
@@ -493,8 +484,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model
-    - **"@gemma3-4b": triggers multimodal (image/text) processing using the Gemma3-4b model.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
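The docstring above is the only inventory of the chat commands; inside generate() each command is picked up by a lowercase prefix check on the incoming message, exactly as the @gemma3-4b branch removed in the next hunk does. A rough sketch of that routing with the branch bodies elided (the prefix list mirrors the docstring; the route helper is illustrative, not a function that exists in app.py):

# Illustrative only: app.py inlines each branch rather than using a helper like this.
PREFIXES = ("@tts1", "@tts2", "@image", "@3d", "@web", "@ragent", "@yolo", "@phi4")

def route(text: str) -> str:
    lowered = text.strip().lower()
    for prefix in PREFIXES:
        if lowered.startswith(prefix):
            return prefix   # dispatch to the matching branch of generate()
    return "default"        # plain text falls through to the default LLM branch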
@@ -654,48 +644,6 @@ def generate(
         yield buffer
         return
 
-    # --- Gemma3-4b Multimodal branch (Image/Text) with Streaming ---
-    if text.strip().lower().startswith("@gemma3-4b"):
-        question = text[len("@gemma3-4b"):].strip()
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": "You are a helpful assistant."}]
-            },
-            {
-                "role": "user",
-                "content": []
-            }
-        ]
-        if files:
-            try:
-                # If file is already a PIL Image, use it; otherwise try opening it.
-                if isinstance(files[0], Image.Image):
-                    image = files[0]
-                else:
-                    image = Image.open(files[0])
-                messages[1]["content"].append({"type": "image", "image": image})
-            except Exception as e:
-                yield f"Error processing image: {str(e)}"
-                return
-        messages[1]["content"].append({"type": "text", "text": question})
-        inputs = gemma3_processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=True,
-            return_dict=True, return_tensors="pt"
-        ).to(gemma3_model.device, dtype=torch.bfloat16)
-        input_len = inputs["input_ids"].shape[-1]
-        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False}
-        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing Gemma3-4b Multimodal")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
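The branch removed above follows the same threaded streaming pattern the Space's other model branches use: build a chat-template prompt with the processor, launch model.generate on a background thread, and yield the growing buffer from a TextIteratorStreamer. A condensed, self-contained sketch of that pattern (assumes a transformers release with Gemma 3 support and access to the google/gemma-3-4b-it weights; loading in bfloat16 is an assumption here, the removed code used the default dtype):

from threading import Thread
import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

model_id = "google/gemma-3-4b-it"
processor = AutoProcessor.from_pretrained(model_id)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
).eval()

def stream_reply(question, image=None, max_new_tokens=512):
    # Assemble the same system/user message layout the removed branch built.
    content = [{"type": "image", "image": image}] if image is not None else []
    content.append({"type": "text", "text": question})
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": content},
    ]
    inputs = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=True,
        return_dict=True, return_tensors="pt",
    ).to(model.device, dtype=torch.bfloat16)
    # Decode streamed token ids with the underlying tokenizer while generate() runs on a thread.
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs={**inputs, "streamer": streamer,
                   "max_new_tokens": max_new_tokens, "do_sample": False}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer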
@@ -785,10 +733,10 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
-        [{"text": "@gemma3-4b Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "@gemma3-4b Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         [{"text": "@phi4 Summarize the content", "files": ["examples/write.jpg"]}],
+        [{"text": "Explain the Image", "files": ["examples/3.jpg"]}],
+        [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         ["@image A drawing of an man made out of hamburger, blue sky background, soft pastel colors"],
@@ -809,7 +757,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @
+        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
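The last hunk only rewrites the hint text shown inside the multimodal textbox. For context, this is roughly how such a ChatInterface is wired up; a stripped-down sketch, not the Space's actual configuration (the real generate() streams model output and takes several extra slider inputs):

import gradio as gr

def generate(input_dict, chat_history):
    # The multimodal textbox delivers {"text": ..., "files": [...]} as the message.
    text = input_dict["text"]
    files = input_dict.get("files", [])
    yield f"echo: {text} ({len(files)} file(s) attached)"

demo = gr.ChatInterface(
    fn=generate,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", "audio"],
        file_count="multiple",
        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
)

if __name__ == "__main__":
    demo.launch()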