Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,7 @@ from threading import Thread
|
|
9 |
import base64
|
10 |
import shutil
|
11 |
import re
|
|
|
12 |
|
13 |
import gradio as gr
|
14 |
import spaces
|
@@ -38,8 +39,8 @@ from diffusers.utils import export_to_ply
|
|
38 |
# Additional import for Phi-4 multimodality (audio support)
|
39 |
import soundfile as sf
|
40 |
|
41 |
-
|
42 |
os.system('pip install backoff')
|
|
|
43 |
# Global constants and helper functions
|
44 |
|
45 |
MAX_SEED = np.iinfo(np.int32).max
|
@@ -59,6 +60,17 @@ def glb_to_data_url(glb_path: str) -> str:
|
|
59 |
b64_data = base64.b64encode(data).decode("utf-8")
|
60 |
return f"data:model/gltf-binary;base64,{b64_data}"
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
# Model class for Text-to-3D Generation (ShapE)
|
63 |
|
64 |
class Model:
|
@@ -458,11 +470,13 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
|
|
458 |
|
459 |
if input_type.lower() == "image":
|
460 |
prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
|
461 |
-
|
|
|
462 |
inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
|
463 |
elif input_type.lower() == "audio":
|
464 |
prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
|
465 |
-
|
|
|
466 |
inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
|
467 |
else:
|
468 |
yield "Invalid input type selected."
|
@@ -719,7 +733,7 @@ demo = gr.ChatInterface(
|
|
719 |
description=DESCRIPTION,
|
720 |
css=css,
|
721 |
fill_height=True,
|
722 |
-
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
|
723 |
stop_btn="Stop Generation",
|
724 |
multimodal=True,
|
725 |
)
|
|
|
9 |
import base64
|
10 |
import shutil
|
11 |
import re
|
12 |
+
from io import BytesIO
|
13 |
|
14 |
import gradio as gr
|
15 |
import spaces
|
|
|
39 |
# Additional import for Phi-4 multimodality (audio support)
|
40 |
import soundfile as sf
|
41 |
|
|
|
42 |
os.system('pip install backoff')
|
43 |
+
|
44 |
# Global constants and helper functions
|
45 |
|
46 |
MAX_SEED = np.iinfo(np.int32).max
|
|
|
60 |
b64_data = base64.b64encode(data).decode("utf-8")
|
61 |
return f"data:model/gltf-binary;base64,{b64_data}"
|
62 |
|
63 |
+
def load_audio_file(file):
    """Read audio samples and the sample rate from a path or an open file.

    Args:
        file: Either a filesystem path (``str`` or any ``os.PathLike``
            such as ``pathlib.Path``) or a binary file-like object that
            exposes ``read()``.

    Returns:
        The ``(audio, samplerate)`` pair produced by ``sf.read``.
    """
    if isinstance(file, (str, os.PathLike)):
        # Path-like objects have no read(); normalize them to a plain path
        # instead of letting them fall through to the file-like branch.
        return sf.read(os.fspath(file))
    # Buffer the whole stream in memory and hand sf.read a BytesIO wrapper.
    return sf.read(BytesIO(file.read()))
|
73 |
+
|
74 |
# Model class for Text-to-3D Generation (ShapE)
|
75 |
|
76 |
class Model:
|
|
|
470 |
|
471 |
if input_type.lower() == "image":
|
472 |
prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
|
473 |
+
# Use load_image (as in Qwen2-VL-OCR-2B-Instruct) to handle image file input
|
474 |
+
image = load_image(file)
|
475 |
inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
|
476 |
elif input_type.lower() == "audio":
|
477 |
prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
|
478 |
+
# Use load_audio_file to handle audio file input
|
479 |
+
audio, samplerate = load_audio_file(file)
|
480 |
inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
|
481 |
else:
|
482 |
yield "Invalid input type selected."
|
|
|
733 |
description=DESCRIPTION,
|
734 |
css=css,
|
735 |
fill_height=True,
|
736 |
+
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4 - audio, image, or plain text"),
|
737 |
stop_btn="Stop Generation",
|
738 |
multimodal=True,
|
739 |
)
|