import gradio as gr
import torch
import os
import numpy as np
from groq import Groq
import spaces
from transformers import AutoModel, AutoTokenizer
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
from parler_tts import ParlerTTSForConditionalGeneration
import soundfile as sf
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.llms.groq import Groq as GroqLLM  # aliased so it doesn't shadow the Groq SDK client above
from PIL import Image
from tavily import TavilyClient
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# Initialize models and clients
MODEL = 'llama3-groq-70b-8192-tool-use-preview'
# The Groq SDK client takes only an API key; the model is selected per request.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Vision-language model for image Q&A
vqa_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                      device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)

# Text-to-speech model
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

# Image generation model: SDXL base weights with the SDXL-Lightning distilled 4-step UNet
base = "stabilityai/stable-diffusion-xl-base-1.0"
repo = "ByteDance/SDXL-Lightning"
ckpt = "sdxl_lightning_4step_unet.safetensors"
unet = UNet2DConditionModel.from_config(base, subfolder="unet")
unet.load_state_dict(load_file(hf_hub_download(repo, ckpt)))
image_pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant="fp16")
# SDXL-Lightning requires "trailing" timestep spacing on the scheduler
image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")

# Tavily client for web search
tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API"))

# Generate a spoken version of the response with Parler-TTS
def play_voice_output(response):
    description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to('cuda')
    prompt_input_ids = tts_tokenizer(response, return_tensors="pt").input_ids.to('cuda')
    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
    return "output.wav"

# NumPy Code Calculator Tool
def numpy_code_calculator(query):
    try:
        llm_response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "user", "content": f"Write NumPy code to: {query}"}
            ]
        )
        code = llm_response.choices[0].message.content
        print(f"Generated NumPy code:\n{code}")
        # Execute the generated code. Note: exec() on LLM output is inherently
        # unsafe; a real deployment would need a sandboxed interpreter.
        local_dict = {"np": np}
        exec(code, local_dict)
        result = local_dict.get("result", "No result found")
        return str(result)
    except Exception as e:
        return f"Error: {e}"

# Web Search Tool
def web_search(query):
    answer = tavily_client.qna_search(query=query)
    return answer

# Image Generation Tool
def image_generation(query):
    # The SDXL-Lightning 4-step checkpoint is distilled for exactly 4 inference
    # steps with guidance disabled (guidance_scale=0), per the model card.
    image = image_pipe(prompt=query, num_inference_steps=4, guidance_scale=0).images[0]
    image.save("output.jpg")
    return "output.jpg"

# Handle the different input types and route them to the right model or tool
def handle_input(user_prompt, image=None, audio=None, websearch=False):
    if audio:
        if isinstance(audio, str):
            audio = open(audio, "rb")
        transcription = client.audio.transcriptions.create(
            file=(audio.name, audio.read()),
            model="whisper-large-v3"
        )
        user_prompt = transcription.text

    # Tool names must be valid identifiers (no spaces) for tool-use models
    tools = [
        FunctionTool.from_defaults(fn=numpy_code_calculator, name="numpy_code_calculator"),
        FunctionTool.from_defaults(fn=web_search, name="web_search"),
        FunctionTool.from_defaults(fn=image_generation, name="image_generation"),
    ]

    llm = GroqLLM(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
    agent = ReActAgent.from_tools(tools, llm=llm, verbose=True)

    if image:
        image = Image.open(image).convert('RGB')
        # msgs format follows the MiniCPM-V chat interface; depending on the
        # model revision, the image may need to be passed via the image kwarg.
        messages = [{"role": "user", "content": [image, user_prompt]}]
        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
        return response

    if websearch:
        response = agent.chat(f"{user_prompt} Use the web_search tool if necessary.")
    else:
        response = agent.chat(user_prompt)
    return str(response)

# Gradio UI Setup
def create_ui():
    with gr.Blocks() as demo:
        gr.Markdown("# AI Assistant")
        with gr.Row():
            with gr.Column(scale=2):
                user_prompt = gr.Textbox(placeholder="Type your message here...", lines=1)
            with gr.Column(scale=1):
                image_input = gr.Image(type="filepath", label="Upload an image", elem_id="image-icon")
                audio_input = gr.Audio(type="filepath", label="Upload audio", elem_id="mic-icon")
                voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode", elem_id="voice-only-mode")
                websearch_mode = gr.Checkbox(label="Enable Web Search", elem_id="websearch-mode")
            with gr.Column(scale=1):
                submit = gr.Button("Submit")

        output_label = gr.Label(label="Output")
        audio_output = gr.Audio(label="Audio Output", visible=False)

        submit.click(
            fn=main_interface,
            inputs=[user_prompt, image_input, audio_input, voice_only_mode, websearch_mode],
            outputs=[output_label, audio_output]
        )

        # A change handler must return one update per output component
        voice_only_mode.change(
            lambda x: [gr.update(visible=not x)] * 4,
            inputs=voice_only_mode,
            outputs=[user_prompt, image_input, websearch_mode, submit]
        )
        voice_only_mode.change(
            lambda x: [gr.update(visible=x)] * 2,
            inputs=voice_only_mode,
            outputs=[audio_input, audio_output]
        )

    return demo

# Main interface function
@spaces.GPU()
def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False):
    vqa_model.to(device='cuda', dtype=torch.bfloat16)
    tts_model.to("cuda")
    unet.to("cuda", torch.float16)  # match the fp16 pipeline dtype
    image_pipe.to("cuda")

    response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch)

    if voice_only:
        audio_output = play_voice_output(response)
        return "Response generated.", audio_output
    else:
        return response, None

# Launch the UI
demo = create_ui()
demo.launch()