prithivMLmods committed on
Commit 035efc4 · verified · 1 Parent(s): ace15c9

Update app.py

Files changed (1)
  1. app.py +105 -166
app.py CHANGED
@@ -39,9 +39,14 @@ from diffusers.utils import export_to_ply
39
  # Additional import for Phi-4 multimodality (audio support)
40
  import soundfile as sf
41
 
 
42
  os.system('pip install backoff')
43
 
44
- # Global constants and helper functions
 
45
 
46
  MAX_SEED = np.iinfo(np.int32).max
47
 
@@ -53,35 +58,30 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
53
  def glb_to_data_url(glb_path: str) -> str:
54
  """
55
  Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
56
- (Not used in this method.)
57
  """
58
  with open(glb_path, "rb") as f:
59
  data = f.read()
60
  b64_data = base64.b64encode(data).decode("utf-8")
61
  return f"data:model/gltf-binary;base64,{b64_data}"
62
 
63
- def get_file_path(file):
64
  """
65
- Normalize a file input. If the input is a string, assume it is a file path.
66
- Otherwise, if the object has a 'name' attribute or key, return that.
67
  """
68
  if isinstance(file, str):
69
- return file
70
- elif hasattr(file, "name"):
71
- return file.name
72
- elif isinstance(file, dict) and "name" in file:
73
- return file["name"]
74
  else:
75
- return None
 
76
 
77
- # Model class for Text-to-3D Generation (ShapE)
78
 
79
  class Model:
80
  def __init__(self):
81
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
82
  self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
83
  self.pipe.to(self.device)
84
- # Ensure the text encoder is in half precision to avoid dtype mismatches.
85
  if torch.cuda.is_available():
86
  try:
87
  self.pipe.text_encoder = self.pipe.text_encoder.half()
@@ -90,7 +90,6 @@ class Model:
90
 
91
  self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
92
  self.pipe_img.to(self.device)
93
- # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
94
  if torch.cuda.is_available():
95
  text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
96
  if text_encoder_img is not None:
@@ -98,7 +97,6 @@ class Model:
98
 
99
  def to_glb(self, ply_path: str) -> str:
100
  mesh = trimesh.load(ply_path)
101
- # Rotate the mesh for proper orientation
102
  rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
103
  mesh.apply_transform(rot)
104
  rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
@@ -133,7 +131,7 @@ class Model:
133
  export_to_ply(images[0], ply_path.name)
134
  return self.to_glb(ply_path.name)
135
 
136
- # New Tools for Web Functionality using DuckDuckGo and smolagents
137
 
138
  from typing import Any, Optional
139
  from smolagents.tools import Tool
@@ -141,7 +139,7 @@ import duckduckgo_search
141
 
142
  class DuckDuckGoSearchTool(Tool):
143
  name = "web_search"
144
- description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
145
  inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
146
  output_type = "string"
147
 
@@ -151,24 +149,20 @@ class DuckDuckGoSearchTool(Tool):
151
  try:
152
  from duckduckgo_search import DDGS
153
  except ImportError as e:
154
- raise ImportError(
155
- "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
156
- ) from e
157
  self.ddgs = DDGS(**kwargs)
158
 
159
  def forward(self, query: str) -> str:
160
  results = self.ddgs.text(query, max_results=self.max_results)
161
  if len(results) == 0:
162
- raise Exception("No results found! Try a less restrictive/shorter query.")
163
- postprocessed_results = [
164
- f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
165
- ]
166
  return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
167
 
168
  class VisitWebpageTool(Tool):
169
  name = "visit_webpage"
170
- description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
171
- inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
172
  output_type = "string"
173
 
174
  def __init__(self, *args, **kwargs):
@@ -179,33 +173,23 @@ class VisitWebpageTool(Tool):
179
  import requests
180
  from markdownify import markdownify
181
  from requests.exceptions import RequestException
182
-
183
  from smolagents.utils import truncate_content
184
  except ImportError as e:
185
- raise ImportError(
186
- "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
187
- ) from e
188
  try:
189
- # Send a GET request to the URL with a 20-second timeout
190
  response = requests.get(url, timeout=20)
191
- response.raise_for_status() # Raise an exception for bad status codes
192
-
193
- # Convert the HTML content to Markdown
194
  markdown_content = markdownify(response.text).strip()
195
-
196
- # Remove multiple line breaks
197
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
198
-
199
  return truncate_content(markdown_content, 10000)
200
-
201
  except requests.exceptions.Timeout:
202
- return "The request timed out. Please try again later or check the URL."
203
  except RequestException as e:
204
  return f"Error fetching the webpage: {str(e)}"
205
  except Exception as e:
206
- return f"An unexpected error occurred: {str(e)}"
207
-
208
- # rAgent Reasoning using Llama mode OpenAI
209
 
210
  from openai import OpenAI
211
 
@@ -216,22 +200,17 @@ ragent_client = OpenAI(
216
  )
217
 
218
  SYSTEM_PROMPT = """
219
-
220
- "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
221
- "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
222
- "2. **Code**: Write Python code to implement your solution.\n"
223
- "3. **Observation**: Analyze the output of the code and summarize the results.\n"
224
- "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
225
- f"Task: {task}"
226
-
227
  """
228
 
229
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
230
- """
231
- Uses the Llama mode OpenAI model to perform a structured reasoning chain.
232
- """
233
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
234
- # Incorporate conversation history (if any)
235
  for msg in history:
236
  if msg.get("role") == "user":
237
  messages.append({"role": "user", "content": msg["content"]})
@@ -252,17 +231,17 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
252
  response += token
253
  yield response
254
 
255
- # Gradio UI configuration
256
 
257
  DESCRIPTION = """
258
- # Agent Dino 🌠 """
 
259
 
260
  css = '''
261
  h1 {
262
  text-align: center;
263
  display: block;
264
  }
265
-
266
  #duplicate-button {
267
  margin: auto;
268
  color: #fff;
@@ -277,9 +256,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
277
 
278
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
279
 
280
- # Load Models and Pipelines for Chat, Image, and Multimodal Processing
281
- # Load the text-only model and tokenizer (for pure text chat)
282
-
283
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
284
  tokenizer = AutoTokenizer.from_pretrained(model_id)
285
  model = AutoModelForCausalLM.from_pretrained(
@@ -289,13 +266,11 @@ model = AutoModelForCausalLM.from_pretrained(
289
  )
290
  model.eval()
291
 
292
- # Voices for text-to-speech
293
  TTS_VOICES = [
294
- "en-US-JennyNeural", # @tts1
295
- "en-US-GuyNeural", # @tts2
296
  ]
297
 
298
- # Load multimodal processor and model (e.g. for OCR and image processing)
299
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
300
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
301
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -304,35 +279,23 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
304
  torch_dtype=torch.float16
305
  ).to("cuda").eval()
306
 
307
- # Asynchronous text-to-speech
308
-
309
  async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
310
- """Convert text to speech using Edge TTS and save as MP3"""
311
  communicate = edge_tts.Communicate(text, voice)
312
  await communicate.save(output_file)
313
  return output_file
314
 
315
- # Utility function to clean conversation history
316
-
317
  def clean_chat_history(chat_history):
318
- """
319
- Filter out any chat entries whose "content" is not a string.
320
- This helps prevent errors when concatenating previous messages.
321
- """
322
  cleaned = []
323
  for msg in chat_history:
324
  if isinstance(msg, dict) and isinstance(msg.get("content"), str):
325
  cleaned.append(msg)
326
  return cleaned
327
 
328
- # Stable Diffusion XL Pipeline for Image Generation
329
- #Model In Use : SG161222/RealVisXL_V5.0_Lightning
330
-
331
- MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
332
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
333
  USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
334
  ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
335
- BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # For batched image generation
336
 
337
  sd_pipe = StableDiffusionXLPipeline.from_pretrained(
338
  MODEL_ID_SD,
@@ -341,18 +304,14 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
341
  add_watermarker=False,
342
  ).to(device)
343
  sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
344
-
345
  if torch.cuda.is_available():
346
  sd_pipe.text_encoder = sd_pipe.text_encoder.half()
347
-
348
  if USE_TORCH_COMPILE:
349
  sd_pipe.compile()
350
-
351
  if ENABLE_CPU_OFFLOAD:
352
  sd_pipe.enable_model_cpu_offload()
353
 
354
  def save_image(img: Image.Image) -> str:
355
- """Save a PIL image with a unique filename and return the path."""
356
  unique_name = str(uuid.uuid4()) + ".png"
357
  img.save(unique_name)
358
  return unique_name
@@ -372,10 +331,8 @@ def generate_image_fn(
372
  num_images: int = 1,
373
  progress=gr.Progress(track_tqdm=True),
374
  ):
375
- """Generate images using the SDXL pipeline."""
376
  seed = int(randomize_seed_fn(seed, randomize_seed))
377
  generator = torch.Generator(device=device).manual_seed(seed)
378
-
379
  options = {
380
  "prompt": [prompt] * num_images,
381
  "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
@@ -388,9 +345,7 @@ def generate_image_fn(
388
  }
389
  if use_resolution_binning:
390
  options["use_resolution_binning"] = True
391
-
392
  images = []
393
- # Process in batches
394
  for i in range(0, num_images, BATCH_SIZE):
395
  batch_options = options.copy()
396
  batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
@@ -405,8 +360,6 @@ def generate_image_fn(
405
  image_paths = [save_image(img) for img in images]
406
  return image_paths, seed
407
 
408
- # Text-to-3D Generation using the ShapE Pipeline
409
-
410
  @spaces.GPU(duration=120, enable_queue=True)
411
  def generate_3d_fn(
412
  prompt: str,
@@ -415,39 +368,28 @@ def generate_3d_fn(
415
  num_steps: int = 64,
416
  randomize_seed: bool = False,
417
  ):
418
- """
419
- Generate a 3D model from text using the ShapE pipeline.
420
- Returns a tuple of (glb_file_path, used_seed).
421
- """
422
  seed = int(randomize_seed_fn(seed, randomize_seed))
423
  model3d = Model()
424
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
425
  return glb_path, seed
426
 
427
- # YOLO Object Detection Setup
428
  YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
429
  YOLO_CHECKPOINT_NAME = "images/demo.pt"
430
  yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
431
  yolo_detector = YOLODetector(yolo_model_path)
432
 
433
  def detect_objects(image: np.ndarray):
434
- """Runs object detection on the input image."""
435
  results = yolo_detector(image, verbose=False)[0]
436
  detections = sv.Detections.from_ultralytics(results).with_nms()
437
-
438
  box_annotator = sv.BoxAnnotator()
439
  label_annotator = sv.LabelAnnotator()
440
-
441
  annotated_image = image.copy()
442
  annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
443
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
444
-
445
  return Image.fromarray(annotated_image)
446
 
447
- # Phi-4 Multimodal Model Setup with Text Streaming
448
-
449
  phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
450
-
451
  phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
452
  phi4_model = AutoModelForCausalLM.from_pretrained(
453
  phi4_model_path,
@@ -457,11 +399,10 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
457
  _attn_implementation="eager",
458
  )
459
 
460
- def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200):
461
  """
462
  Process an image or audio input with the Phi-4 multimodal model.
463
- Uses a text streamer to yield incremental outputs.
464
- Expects input_type to be either 'image' or 'audio'.
465
  """
466
  user_prompt = '<|user|>'
467
  assistant_prompt = '<|assistant|>'
@@ -471,24 +412,22 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
471
  yield "Please upload a file and provide a question."
472
  return
473
 
474
- file_path = get_file_path(file)
475
- if file_path is None:
476
- yield "Could not determine the file path."
477
- return
478
-
479
- if input_type.lower() == "image":
480
- prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
481
- image = Image.open(file_path)
482
- inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
483
- elif input_type.lower() == "audio":
484
- prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
485
- audio, samplerate = sf.read(file_path)
486
- inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
487
- else:
488
- yield "Invalid input type selected."
489
  return
490
 
491
- # Setup text streamer using TextIteratorStreamer for incremental generation
492
  streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
493
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
494
  thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
@@ -501,8 +440,6 @@ def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200
501
  time.sleep(0.01)
502
  yield buffer
503
 
504
- # Chat Generation Function with support for @tts, @image, @3d, @web, @ragent, @yolo, and now @phi4 commands
505
-
506
  @spaces.GPU
507
  def generate(
508
  input_dict: dict,
@@ -514,19 +451,54 @@ def generate(
514
  repetition_penalty: float = 1.2,
515
  ):
516
  """
517
- Generates chatbot responses with support for multimodal input and special commands:
518
- - "@tts1" or "@tts2": triggers text-to-speech.
519
- - "@image": triggers image generation using the SDXL pipeline.
520
- - "@3d": triggers 3D model generation using the ShapE pipeline.
521
- - "@web": triggers a web search or webpage visit.
522
- - "@ragent": initiates a reasoning chain using Llama mode.
523
- - "@yolo": triggers object detection using YOLO.
524
- - **New:** "@phi4": processes image or audio inputs with the Phi-4 multimodal model and streams text output.
 
525
  """
526
  text = input_dict["text"]
527
  files = input_dict.get("files", [])
528
 
529
- # --- 3D Generation branch ---
 
530
  if text.strip().lower().startswith("@3d"):
531
  prompt = text[len("@3d"):].strip()
532
  yield "πŸŒ€ Hold tight, generating a 3D mesh GLB file....."
@@ -537,18 +509,15 @@ def generate(
537
  num_steps=64,
538
  randomize_seed=True,
539
  )
540
- # Copy the GLB file to a static folder.
541
  static_folder = os.path.join(os.getcwd(), "static")
542
  if not os.path.exists(static_folder):
543
  os.makedirs(static_folder)
544
  new_filename = f"mesh_{uuid.uuid4()}.glb"
545
  new_filepath = os.path.join(static_folder, new_filename)
546
  shutil.copy(glb_path, new_filepath)
547
-
548
  yield gr.File(new_filepath)
549
  return
550
 
551
- # --- Image Generation branch ---
552
  if text.strip().lower().startswith("@image"):
553
  prompt = text[len("@image"):].strip()
554
  yield "πŸͺ§ Generating image..."
@@ -568,7 +537,6 @@ def generate(
568
  yield gr.Image(image_paths[0])
569
  return
570
 
571
- # --- Web Search/Visit branch ---
572
  if text.strip().lower().startswith("@web"):
573
  web_command = text[len("@web"):].strip()
574
  if web_command.lower().startswith("visit"):
@@ -585,7 +553,6 @@ def generate(
585
  yield results
586
  return
587
 
588
- # --- rAgent Reasoning branch ---
589
  if text.strip().lower().startswith("@ragent"):
590
  prompt = text[len("@ragent"):].strip()
591
  yield "πŸ“ Initiating reasoning chain using Llama mode..."
@@ -593,7 +560,6 @@ def generate(
593
  yield partial
594
  return
595
 
596
- # --- YOLO Object Detection branch ---
597
  if text.strip().lower().startswith("@yolo"):
598
  yield "πŸ” Running object detection with YOLO..."
599
  if not files or len(files) == 0:
@@ -604,7 +570,7 @@ def generate(
604
  if isinstance(input_file, str):
605
  pil_image = Image.open(input_file)
606
  else:
607
- pil_image = Image.open(get_file_path(input_file))
608
  except Exception as e:
609
  yield f"Error loading image: {str(e)}"
610
  return
@@ -613,28 +579,9 @@ def generate(
613
  yield gr.Image(result_img)
614
  return
615
 
616
- # --- Phi-4 Multimodal branch with text streaming ---
617
- if text.strip().lower().startswith("@phi4"):
618
- parts = text.strip().split(maxsplit=2)
619
- if len(parts) < 3:
620
- yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
621
- return
622
- input_type = parts[1]
623
- question = parts[2]
624
- if not files or len(files) == 0:
625
- yield "Error: Please attach an image or audio file for Phi-4 processing."
626
- return
627
- file_input = files[0]
628
- yield "🔄 Processing multimodal input with Phi-4..."
629
- for partial in process_phi4(input_type, file_input, question):
630
- yield partial
631
- return
632
-
633
- # --- Text and TTS branch ---
634
  tts_prefix = "@tts"
635
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
636
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
637
-
638
  if is_tts and voice_index:
639
  voice = TTS_VOICES[voice_index - 1]
640
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -644,12 +591,11 @@ def generate(
644
  text = text.replace(tts_prefix, "").strip()
645
  conversation = clean_chat_history(chat_history)
646
  conversation.append({"role": "user", "content": text})
647
-
648
  if files:
649
  if len(files) > 1:
650
- images = [load_image(get_file_path(image)) for image in files]
651
  elif len(files) == 1:
652
- images = [load_image(get_file_path(files[0]))]
653
  else:
654
  images = []
655
  messages = [{
@@ -665,7 +611,6 @@ def generate(
665
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
666
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
667
  thread.start()
668
-
669
  buffer = ""
670
  yield "πŸ€” Thinking..."
671
  for new_text in streamer:
@@ -693,21 +638,16 @@ def generate(
693
  }
694
  t = Thread(target=model.generate, kwargs=generation_kwargs)
695
  t.start()
696
-
697
  outputs = []
698
  for new_text in streamer:
699
  outputs.append(new_text)
700
  yield "".join(outputs)
701
-
702
  final_response = "".join(outputs)
703
  yield final_response
704
-
705
  if is_tts and voice:
706
  output_file = asyncio.run(text_to_speech(final_response, voice))
707
  yield gr.Audio(output_file, autoplay=True)
708
 
709
- # Gradio Chat Interface Setup and Launch
710
-
711
  demo = gr.ChatInterface(
712
  fn=generate,
713
  additional_inputs=[
@@ -739,7 +679,6 @@ demo = gr.ChatInterface(
739
  multimodal=True,
740
  )
741
 
742
- # Ensure the static folder exists
743
  if not os.path.exists("static"):
744
  os.makedirs("static")
745
 
 
39
  # Additional import for Phi-4 multimodality (audio support)
40
  import soundfile as sf
41
 
42
+ # Install additional dependencies if needed
43
  os.system('pip install backoff')
44
 
45
+ # --- File validation constants ---
46
+ IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
47
+ AUDIO_EXTENSIONS = ['.wav', '.mp3', '.flac', '.ogg']
48
+
49
+ # --- Global constants and helper functions ---
50
 
51
  MAX_SEED = np.iinfo(np.int32).max
52
 
 
58
  def glb_to_data_url(glb_path: str) -> str:
59
  """
60
  Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
 
61
  """
62
  with open(glb_path, "rb") as f:
63
  data = f.read()
64
  b64_data = base64.b64encode(data).decode("utf-8")
65
  return f"data:model/gltf-binary;base64,{b64_data}"
66
 
67
+ def load_audio_file(file):
68
  """
69
+ Loads an audio file. If file is a string path, it reads directly.
70
+ Otherwise, assumes file is a file-like object.
71
  """
72
  if isinstance(file, str):
73
+ audio, samplerate = sf.read(file)
 
 
 
 
74
  else:
75
+ audio, samplerate = sf.read(BytesIO(file.read()))
76
+ return audio, samplerate
77
 
78
+ # --- Model class for Text-to-3D Generation (ShapE) ---
79
 
80
  class Model:
81
  def __init__(self):
82
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
83
  self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
84
  self.pipe.to(self.device)
 
85
  if torch.cuda.is_available():
86
  try:
87
  self.pipe.text_encoder = self.pipe.text_encoder.half()
 
90
 
91
  self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
92
  self.pipe_img.to(self.device)
 
93
  if torch.cuda.is_available():
94
  text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
95
  if text_encoder_img is not None:
 
97
 
98
  def to_glb(self, ply_path: str) -> str:
99
  mesh = trimesh.load(ply_path)
 
100
  rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
101
  mesh.apply_transform(rot)
102
  rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
 
131
  export_to_ply(images[0], ply_path.name)
132
  return self.to_glb(ply_path.name)
133
 
134
+ # --- New Tools for Web Functionality using DuckDuckGo and smolagents ---
135
 
136
  from typing import Any, Optional
137
  from smolagents.tools import Tool
 
139
 
140
  class DuckDuckGoSearchTool(Tool):
141
  name = "web_search"
142
+ description = "Performs a duckduckgo web search based on your query then returns the top search results."
143
  inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
144
  output_type = "string"
145
 
 
149
  try:
150
  from duckduckgo_search import DDGS
151
  except ImportError as e:
152
+ raise ImportError("Install duckduckgo-search via pip.") from e
 
 
153
  self.ddgs = DDGS(**kwargs)
154
 
155
  def forward(self, query: str) -> str:
156
  results = self.ddgs.text(query, max_results=self.max_results)
157
  if len(results) == 0:
158
+ raise Exception("No results found! Try a less restrictive query.")
159
+ postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
 
 
160
  return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
161
 
162
  class VisitWebpageTool(Tool):
163
  name = "visit_webpage"
164
+ description = "Visits a webpage at the given URL and returns its content as markdown."
165
+ inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to visit.'}}
166
  output_type = "string"
167
 
168
  def __init__(self, *args, **kwargs):
 
173
  import requests
174
  from markdownify import markdownify
175
  from requests.exceptions import RequestException
 
176
  from smolagents.utils import truncate_content
177
  except ImportError as e:
178
+ raise ImportError("Install markdownify and requests via pip.") from e
 
 
179
  try:
 
180
  response = requests.get(url, timeout=20)
181
+ response.raise_for_status()
 
 
182
  markdown_content = markdownify(response.text).strip()
 
 
183
  markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
 
184
  return truncate_content(markdown_content, 10000)
 
185
  except requests.exceptions.Timeout:
186
+ return "The request timed out. Please try again later."
187
  except RequestException as e:
188
  return f"Error fetching the webpage: {str(e)}"
189
  except Exception as e:
190
+ return f"Unexpected error: {str(e)}"
191
+
192
+ # --- rAgent Reasoning using Llama mode OpenAI ---
193
 
194
  from openai import OpenAI
195
 
 
200
  )
201
 
202
  SYSTEM_PROMPT = """
203
+ "You are an expert assistant who solves tasks using Python code. Follow these steps:
204
+ 1. Thought: Explain your reasoning and plan.
205
+ 2. Code: Write Python code to implement your solution.
206
+ 3. Observation: Analyze the output.
207
+ 4. Final Answer: Provide a concise conclusion.
208
+
209
+ Task: {task}"
 
210
  """
211
 
212
  def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
 
 
 
213
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
 
214
  for msg in history:
215
  if msg.get("role") == "user":
216
  messages.append({"role": "user", "content": msg["content"]})
 
231
  response += token
232
  yield response
233
 
234
+ # --- Gradio UI configuration ---
235
 
236
  DESCRIPTION = """
237
+ # Agent Dino 🌠
238
+ """
239
 
240
  css = '''
241
  h1 {
242
  text-align: center;
243
  display: block;
244
  }
 
245
  #duplicate-button {
246
  margin: auto;
247
  color: #fff;
 
256
 
257
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
258
 
259
+ # --- Load Models and Pipelines for Chat, Image, and Multimodal Processing ---
 
 
260
  model_id = "prithivMLmods/FastThink-0.5B-Tiny"
261
  tokenizer = AutoTokenizer.from_pretrained(model_id)
262
  model = AutoModelForCausalLM.from_pretrained(
 
266
  )
267
  model.eval()
268
 
 
269
  TTS_VOICES = [
270
+ "en-US-JennyNeural",
271
+ "en-US-GuyNeural",
272
  ]
273
 
 
274
  MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
275
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
276
  model_m = Qwen2VLForConditionalGeneration.from_pretrained(
 
279
  torch_dtype=torch.float16
280
  ).to("cuda").eval()
281
 
 
 
282
  async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
 
283
  communicate = edge_tts.Communicate(text, voice)
284
  await communicate.save(output_file)
285
  return output_file
286
 
 
 
287
  def clean_chat_history(chat_history):
 
 
 
 
288
  cleaned = []
289
  for msg in chat_history:
290
  if isinstance(msg, dict) and isinstance(msg.get("content"), str):
291
  cleaned.append(msg)
292
  return cleaned
293
 
294
+ MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
 
 
 
295
  MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
296
  USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
297
  ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
298
+ BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
299
 
300
  sd_pipe = StableDiffusionXLPipeline.from_pretrained(
301
  MODEL_ID_SD,
 
304
  add_watermarker=False,
305
  ).to(device)
306
  sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 
307
  if torch.cuda.is_available():
308
  sd_pipe.text_encoder = sd_pipe.text_encoder.half()
 
309
  if USE_TORCH_COMPILE:
310
  sd_pipe.compile()
 
311
  if ENABLE_CPU_OFFLOAD:
312
  sd_pipe.enable_model_cpu_offload()
313
 
314
  def save_image(img: Image.Image) -> str:
 
315
  unique_name = str(uuid.uuid4()) + ".png"
316
  img.save(unique_name)
317
  return unique_name
 
331
  num_images: int = 1,
332
  progress=gr.Progress(track_tqdm=True),
333
  ):
 
334
  seed = int(randomize_seed_fn(seed, randomize_seed))
335
  generator = torch.Generator(device=device).manual_seed(seed)
 
336
  options = {
337
  "prompt": [prompt] * num_images,
338
  "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
 
345
  }
346
  if use_resolution_binning:
347
  options["use_resolution_binning"] = True
 
348
  images = []
 
349
  for i in range(0, num_images, BATCH_SIZE):
350
  batch_options = options.copy()
351
  batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
 
360
  image_paths = [save_image(img) for img in images]
361
  return image_paths, seed
362
 
 
 
363
  @spaces.GPU(duration=120, enable_queue=True)
364
  def generate_3d_fn(
365
  prompt: str,
 
368
  num_steps: int = 64,
369
  randomize_seed: bool = False,
370
  ):
 
 
 
 
371
  seed = int(randomize_seed_fn(seed, randomize_seed))
372
  model3d = Model()
373
  glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
374
  return glb_path, seed
375
 
 
376
  YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
377
  YOLO_CHECKPOINT_NAME = "images/demo.pt"
378
  yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
379
  yolo_detector = YOLODetector(yolo_model_path)
380
 
381
  def detect_objects(image: np.ndarray):
 
382
  results = yolo_detector(image, verbose=False)[0]
383
  detections = sv.Detections.from_ultralytics(results).with_nms()
 
384
  box_annotator = sv.BoxAnnotator()
385
  label_annotator = sv.LabelAnnotator()
 
386
  annotated_image = image.copy()
387
  annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
388
  annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
 
389
  return Image.fromarray(annotated_image)
390
 
391
+ # --- Phi-4 Multimodal Model Setup with Text Streaming ---
 
392
  phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 
393
  phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
394
  phi4_model = AutoModelForCausalLM.from_pretrained(
395
  phi4_model_path,
 
399
  _attn_implementation="eager",
400
  )
401
 
402
+ def process_phi4(input_type: str, file: str, question: str, max_new_tokens: int = 200):
403
  """
404
  Process an image or audio input with the Phi-4 multimodal model.
405
+ Expects input_type to be either 'image' or 'audio' and file is a file path.
 
406
  """
407
  user_prompt = '<|user|>'
408
  assistant_prompt = '<|assistant|>'
 
412
  yield "Please upload a file and provide a question."
413
  return
414
 
415
+ try:
416
+ if input_type == "image":
417
+ prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
418
+ image = load_image(file)
419
+ inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
420
+ elif input_type == "audio":
421
+ prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
422
+ audio, samplerate = load_audio_file(file)
423
+ inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
424
+ else:
425
+ yield "Invalid input type selected. Use 'image' or 'audio'."
426
+ return
427
+ except Exception as e:
428
+ yield f"Error loading file: {str(e)}"
 
429
  return
430
 
 
431
  streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
432
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
433
  thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
 
440
  time.sleep(0.01)
441
  yield buffer
442
 
 
 
443
  @spaces.GPU
444
  def generate(
445
  input_dict: dict,
 
451
  repetition_penalty: float = 1.2,
452
  ):
453
  """
454
+ Generates chatbot responses with support for multimodal input and special commands.
455
+ Special commands include:
456
+ - "@tts1" or "@tts2": Text-to-speech.
457
+ - "@image": Image generation using the SDXL pipeline.
458
+ - "@3d": 3D model generation using the ShapE pipeline.
459
+ - "@web": Web search or webpage visit.
460
+ - "@ragent": Reasoning chain using Llama mode.
461
+ - "@yolo": Object detection using YOLO.
462
+ - "@phi4": Processes image or audio inputs with the Phi-4 model and streams text output.
463
  """
464
  text = input_dict["text"]
465
  files = input_dict.get("files", [])
466
 
467
+ # --- Phi-4 Multimodal branch with text streaming ---
468
+ if text.strip().lower().startswith("@phi4"):
469
+ parts = text.strip().split(maxsplit=2)
470
+ if len(parts) < 3:
471
+ yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
472
+ return
473
+ input_type = parts[1].lower()
474
+ question = parts[2]
475
+
476
+ if not files or len(files) == 0:
477
+ yield "Error: Please attach an image or audio file for Phi-4 processing."
478
+ return
479
+
480
+ if len(files) > 1:
481
+ yield "Warning: Multiple files attached. Only the first file will be processed."
482
+
483
+ file_input = files[0] # This is a string path from gr.MultimodalTextbox
484
+
485
+ extension = os.path.splitext(file_input)[1].lower()
486
+ if input_type == "image" and extension not in IMAGE_EXTENSIONS:
487
+ yield f"Error: Attached file is not an image. Expected extensions: {', '.join(IMAGE_EXTENSIONS)}"
488
+ return
489
+ elif input_type == "audio" and extension not in AUDIO_EXTENSIONS:
490
+ yield f"Error: Attached file is not an audio file. Expected extensions: {', '.join(AUDIO_EXTENSIONS)}"
491
+ return
492
+
493
+ yield "🔄 Processing multimodal input with Phi-4..."
494
+ try:
495
+ for partial in process_phi4(input_type, file_input, question):
496
+ yield partial
497
+ except Exception as e:
498
+ yield f"Error processing file: {str(e)}"
499
+ return
500
+
501
+ # --- Other branches remain unchanged ---
502
  if text.strip().lower().startswith("@3d"):
503
  prompt = text[len("@3d"):].strip()
504
  yield "πŸŒ€ Hold tight, generating a 3D mesh GLB file....."
 
509
  num_steps=64,
510
  randomize_seed=True,
511
  )
 
512
  static_folder = os.path.join(os.getcwd(), "static")
513
  if not os.path.exists(static_folder):
514
  os.makedirs(static_folder)
515
  new_filename = f"mesh_{uuid.uuid4()}.glb"
516
  new_filepath = os.path.join(static_folder, new_filename)
517
  shutil.copy(glb_path, new_filepath)
 
518
  yield gr.File(new_filepath)
519
  return
520
 
 
521
  if text.strip().lower().startswith("@image"):
522
  prompt = text[len("@image"):].strip()
523
  yield "πŸͺ§ Generating image..."
 
537
  yield gr.Image(image_paths[0])
538
  return
539
 
 
540
  if text.strip().lower().startswith("@web"):
541
  web_command = text[len("@web"):].strip()
542
  if web_command.lower().startswith("visit"):
 
553
  yield results
554
  return
555
 
 
556
  if text.strip().lower().startswith("@ragent"):
557
  prompt = text[len("@ragent"):].strip()
558
  yield "πŸ“ Initiating reasoning chain using Llama mode..."
 
560
  yield partial
561
  return
562
 
 
563
  if text.strip().lower().startswith("@yolo"):
564
  yield "πŸ” Running object detection with YOLO..."
565
  if not files or len(files) == 0:
 
570
  if isinstance(input_file, str):
571
  pil_image = Image.open(input_file)
572
  else:
573
+ pil_image = Image.open(input_file)
574
  except Exception as e:
575
  yield f"Error loading image: {str(e)}"
576
  return
 
579
  yield gr.Image(result_img)
580
  return
581
 
582
  tts_prefix = "@tts"
583
  is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
584
  voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
 
585
  if is_tts and voice_index:
586
  voice = TTS_VOICES[voice_index - 1]
587
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
 
591
  text = text.replace(tts_prefix, "").strip()
592
  conversation = clean_chat_history(chat_history)
593
  conversation.append({"role": "user", "content": text})
 
594
  if files:
595
  if len(files) > 1:
596
+ images = [load_image(file) for file in files]
597
  elif len(files) == 1:
598
+ images = [load_image(files[0])]
599
  else:
600
  images = []
601
  messages = [{
 
611
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
612
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
613
  thread.start()
 
614
  buffer = ""
615
  yield "πŸ€” Thinking..."
616
  for new_text in streamer:
 
638
  }
639
  t = Thread(target=model.generate, kwargs=generation_kwargs)
640
  t.start()
 
641
  outputs = []
642
  for new_text in streamer:
643
  outputs.append(new_text)
644
  yield "".join(outputs)
 
645
  final_response = "".join(outputs)
646
  yield final_response
 
647
  if is_tts and voice:
648
  output_file = asyncio.run(text_to_speech(final_response, voice))
649
  yield gr.Audio(output_file, autoplay=True)
650
 
 
 
651
  demo = gr.ChatInterface(
652
  fn=generate,
653
  additional_inputs=[
 
679
  multimodal=True,
680
  )
681
 
 
682
  if not os.path.exists("static"):
683
  os.makedirs("static")
684