prithivMLmods committed
Commit 3541fa7 · verified · 1 Parent(s): 3c3acfd

Update app.py

Files changed (1)
  1. app.py +18 -4
app.py CHANGED
@@ -9,6 +9,7 @@ from threading import Thread
 import base64
 import shutil
 import re
+from io import BytesIO
 
 import gradio as gr
 import spaces
@@ -38,8 +39,8 @@ from diffusers.utils import export_to_ply
 # Additional import for Phi-4 multimodality (audio support)
 import soundfile as sf
 
-
 os.system('pip install backoff')
+
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -59,6 +60,17 @@ def glb_to_data_url(glb_path: str) -> str:
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
 
+def load_audio_file(file):
+    """
+    Loads an audio file. If file is a string path, it reads directly.
+    Otherwise, it assumes file is a file-like object.
+    """
+    if isinstance(file, str):
+        audio, samplerate = sf.read(file)
+    else:
+        audio, samplerate = sf.read(BytesIO(file.read()))
+    return audio, samplerate
+
 # Model class for Text-to-3D Generation (ShapE)
 
 class Model:
@@ -458,11 +470,13 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
 
     if input_type.lower() == "image":
         prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
-        image = Image.open(file)
+        # Use load_image (as in Qwen2-VL-OCR-2B-Instruct) to handle image file input
+        image = load_image(file)
         inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
     elif input_type.lower() == "audio":
         prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
-        audio, samplerate = sf.read(file)
+        # Use load_audio_file to handle audio file input
+        audio, samplerate = load_audio_file(file)
         inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
     else:
         yield "Invalid input type selected."
@@ -719,7 +733,7 @@ demo = gr.ChatInterface(
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4 - audio, image, or plain text"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
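
Below is a minimal, self-contained usage sketch of the load_audio_file helper introduced by this commit, covering both input shapes it handles (a plain filesystem path and an open file-like object). The helper body is copied from the diff; the synthetic tone, the sample.wav filename, and the surrounding scaffolding are illustrative assumptions, not part of app.py.

    # Usage sketch for the load_audio_file helper added in this commit.
    # Assumes soundfile and numpy are installed; "sample.wav" is a placeholder path.
    from io import BytesIO

    import numpy as np
    import soundfile as sf


    def load_audio_file(file):
        """
        Loads an audio file. If file is a string path, it reads directly.
        Otherwise, it assumes file is a file-like object.
        """
        if isinstance(file, str):
            audio, samplerate = sf.read(file)
        else:
            audio, samplerate = sf.read(BytesIO(file.read()))
        return audio, samplerate


    if __name__ == "__main__":
        # Write a short synthetic 440 Hz tone so the example is self-contained.
        sr = 16000
        tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype("float32")
        sf.write("sample.wav", tone, sr)

        # Case 1: a plain string path (what Gradio typically passes for uploads).
        audio, samplerate = load_audio_file("sample.wav")
        print(audio.shape, samplerate)

        # Case 2: an open file-like object.
        with open("sample.wav", "rb") as f:
            audio, samplerate = load_audio_file(f)
        print(audio.shape, samplerate)

Note that the image branch now calls load_image(file) instead of Image.open(file); the diff does not show a new import for it, so it presumably relies on a load_image helper already imported elsewhere in app.py, as the inline comment's reference to Qwen2-VL-OCR-2B-Instruct suggests.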