Spaces:

VanguardAI
/

MultiModal_OpenSource_AI

Runtime error

App Files Files Community

VanguardAI committed on Aug 14, 2024

Commit

1197e50

verified ·

1 Parent(s): f7d8d6b

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -82

app.py CHANGED Viewed

@@ -7,10 +7,12 @@ from transformers import AutoModel, AutoTokenizer
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
-from langchain_community.embeddings import OpenAIEmbeddings
-from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import RetrievalQA
 from PIL import Image
 from decord import VideoReader, cpu
 from tavily import TavilyClient
@@ -18,31 +20,31 @@ import requests
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
-# Initialize models
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
-text_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                        device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
-# Corrected image model and pipeline setup
 base = "stabilityai/stable-diffusion-xl-base-1.0"
 repo = "ByteDance/SDXL-Lightning"
 ckpt = "sdxl_lightning_4step_unet.safetensors"
 unet = UNet2DConditionModel.from_config(base, subfolder="unet").to("cuda", torch.float16)
 unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device="cuda"))
-image_pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant="fp16").to("cuda")
 image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
-# Tavily Client
-tavily_client = TavilyClient(api_key="tvly-YOUR_API_KEY")
-# Voice output function
 def play_voice_output(response):
     description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to('cuda')
@@ -52,50 +54,55 @@ def play_voice_output(response):
     sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
     return "output.wav"
-# NumPy Calculation function
def numpy_calculate(code: str) -> str:
    """Execute a snippet of Python/NumPy code and return its ``result`` as text.

    The snippet runs with ``np`` pre-bound and is expected to assign its
    answer to a variable named ``result``.

    Args:
        code: Python source to execute.

    Returns:
        ``str(result)`` if the snippet defined ``result``; the literal
        ``"No result found"`` if it did not; or ``"An error occurred: ..."``
        if compiling or running the snippet raised.
    """
    # SECURITY(review): exec() runs arbitrary code with full builtins. This is
    # not a sandbox; do not feed it untrusted input without real isolation.
    #
    # One dict serves as the whole execution namespace. With separate
    # globals/locals dicts, functions and lambdas defined by the snippet
    # resolve free names in globals only, so they cannot see the snippet's own
    # top-level assignments and fail with NameError.
    namespace = {"np": np}
    try:
        exec(code, namespace)
        result = namespace.get("result", "No result found")
        return str(result)
    except Exception as e:  # boundary: any snippet failure is reported as text
        return f"An error occurred: {e}"
-# Function to use Langchain for RAG
def use_langchain_rag(file_name, file_content, query):
    """Answer ``query`` from ``file_content`` using a retrieval QA chain.

    The text is split into chunks, embedded into a Chroma store persisted
    under ``.chroma_db``, and queried through a "stuff" RetrievalQA chain.
    ``file_name`` is accepted but not used here.
    """
    # Chunk the document (chunk_size=1000, no overlap).
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.create_documents([file_content])

    # Embed the chunks and store them in the on-disk vector database.
    embedder = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        chunks,
        embedder,
        persist_directory=".chroma_db",
    )

    # Build the question-answering chain over the store's retriever and run it.
    chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return chain.run(query)
-# Function to encode video
def encode_video(video_path):
    """Sample frames from a video and return them as PIL images.

    Frame indices are taken with a stride equal to the rounded average FPS
    reported by the reader. If that yields more than ``MAX_NUM_FRAMES``
    candidates, they are thinned to exactly ``MAX_NUM_FRAMES`` evenly spaced
    picks.

    Args:
        video_path: Path handed to ``VideoReader``.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per selected frame.
    """
    MAX_NUM_FRAMES = 64

    def uniform_sample(seq, n):
        # Take n items at the centres of n equal-width bins over seq.
        # Defined locally because no `uniform_sample` is visible at module
        # level; without it the thinning branch below raises NameError.
        gap = len(seq) / n
        return [seq[int(i * gap + gap / 2)] for i in range(n)]

    vr = VideoReader(video_path, ctx=cpu(0))
    # Clamp the stride to at least 1: an average FPS below 0.5 rounds to 0,
    # and range() rejects a step of 0.
    sample_fps = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
    frames = vr.get_batch(frame_idx).asnumpy()
    return [Image.fromarray(v.astype('uint8')) for v in frames]
-# Web search function
def web_search(query):
    """Return the Tavily question-answering search result for ``query``."""
    return tavily_client.qna_search(query=query)
-# Function to handle different input types
 def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, websearch=False):
     # Voice input handling
     if audio:
@@ -105,50 +112,58 @@ def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, webs
         )
         user_prompt = transcription.text
     # If user uploaded an image and text, use MiniCPM model
     if image:
         image = Image.open(image).convert('RGB')
         messages = [{"role": "user", "content": [image, user_prompt]}]
-        response = text_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
         return response
-    # Determine which tool to use
-    if doc:
-        file_content = doc.read().decode('utf-8')
-        response = use_langchain_rag(doc.name, file_content, user_prompt)
-    elif "calculate" in user_prompt.lower():
-        response = numpy_calculate(user_prompt)
-    elif "generate" in user_prompt.lower() and ("image" in user_prompt.lower() or "picture" in user_prompt.lower()):
-        response = image_pipe(prompt=user_prompt, num_inference_steps=20, guidance_scale=7.5)
-    elif websearch:
-        response = web_search(user_prompt)
     else:
-        chat_completion = client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": user_prompt}
-            ],
-            model=MODEL,
-        )
-        response = chat_completion.choices[0].message.content
     return response
@spaces.GPU()
def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False, websearch=False):
    """Gradio entry point: move the models to CUDA, then answer the request.

    Returns a ``(response, audio_file)`` pair. ``audio_file`` is the path
    returned by ``play_voice_output`` when ``voice_only`` is truthy, and
    ``None`` otherwise.
    """
    # Put every model on the GPU before handling the request.
    text_model.to(device='cuda', dtype=torch.bfloat16)
    tts_model.to("cuda")
    unet.to("cuda", torch.float16)
    image_pipe.to("cuda")

    reply = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc, websearch=websearch)

    # Speech is synthesised only in voice-only mode.
    spoken = play_voice_output(reply) if voice_only else None
    return reply, spoken
 # Gradio UI Setup
 def create_ui():
     with gr.Blocks() as demo:
@@ -158,28 +173,27 @@ def create_ui():
                 user_prompt = gr.Textbox(placeholder="Type your message here...", lines=1)
             with gr.Column(scale=1):
                 image_input = gr.Image(type="filepath", label="Upload an image", elem_id="image-icon")
-                video_input = gr.Video(label="Upload a video", elem_id="video-icon")
                 audio_input = gr.Audio(type="filepath", label="Upload audio", elem_id="mic-icon")
                 doc_input = gr.File(type="filepath", label="Upload a document", elem_id="document-icon")
                 voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode", elem_id="voice-only-mode")
                 websearch_mode = gr.Checkbox(label="Enable Web Search", elem_id="websearch-mode")
             with gr.Column(scale=1):
                 submit = gr.Button("Submit")
         output_label = gr.Label(label="Output")
         audio_output = gr.Audio(label="Audio Output", visible=False)
         submit.click(
             fn=main_interface,
-            inputs=[user_prompt, image_input, video_input, audio_input, doc_input, voice_only_mode, websearch_mode],
-            outputs=[output_label, audio_output]  # Expecting a string and audio file
         )
         # Voice-only mode UI
         voice_only_mode.change(
             lambda x: gr.update(visible=not x),
             inputs=voice_only_mode,
-            outputs=[user_prompt, image_input, video_input, doc_input, websearch_mode, submit]
         )
         voice_only_mode.change(
             lambda x: gr.update(visible=x),
@@ -189,6 +203,22 @@ def create_ui():
     return demo
 # Launch the app
 demo = create_ui()
-demo.launch(inline=False)

 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
+from langchain_community.embeddings import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import RetrievalQA
+from langchain.agents import initialize_agent, Tool
+from langchain.llms import OpenAI
 from PIL import Image
 from decord import VideoReader, cpu
 from tavily import TavilyClient
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
+# Initialize models and clients
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
+vqa_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                        device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+# Image generation model
 base = "stabilityai/stable-diffusion-xl-base-1.0"
 repo = "ByteDance/SDXL-Lightning"
 ckpt = "sdxl_lightning_4step_unet.safetensors"
 unet = UNet2DConditionModel.from_config(base, subfolder="unet").to("cuda", torch.float16)
 unet.load_state_dict(load_file(hf_hub_download(repo, ckpt), device="cuda"))
+image_pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant="fp16")
 image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
+# Tavily Client for web search
+tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
+# Function to play voice output
 def play_voice_output(response):
     description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to('cuda')
     sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
     return "output.wav"
+# NumPy Code Calculator Tool
def numpy_code_calculator(query):
    """Ask the chat model for NumPy code answering ``query``, run it, return the result.

    The model is asked to reply with code only and to store its answer in a
    variable named ``result``. If the reply is wrapped in a Markdown code
    fence, only the fenced code is executed.

    Args:
        query: Natural-language description of the calculation.

    Returns:
        ``str(result)`` if the executed code defined ``result``; the literal
        ``"No result found"`` if it did not; or ``"Error: ..."`` if the API
        call or the code execution raised.
    """
    import re  # local import: only needed to unwrap fenced replies

    try:
        llm_response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "user",
                    # The extra instructions matter: the code below reads a
                    # variable named `result` and exec() needs bare Python.
                    "content": (
                        f"Write NumPy code to: {query}\n"
                        "Reply with Python code only, no explanation. "
                        "NumPy is already imported as `np`. "
                        "Store the final answer in a variable named `result`."
                    ),
                }
            ],
        )
        code = llm_response.choices[0].message.content or ""

        # Chat replies are commonly wrapped in ```python ... ``` fences, which
        # exec() would reject as a SyntaxError. Keep only the fenced body.
        fenced = re.search(r"```[\w+-]*[ \t]*\r?\n(.*?)```", code, re.DOTALL)
        if fenced:
            code = fenced.group(1)
        print(f"Generated NumPy code:\n{code}")  # Print the generated code

        # SECURITY(review): exec() runs model-generated code with full
        # builtins in this process. This is NOT a sandbox; isolate it before
        # exposing the tool to untrusted users.
        local_dict = {"np": np}
        exec(code, local_dict)
        result = local_dict.get("result", "No result found")
        return str(result)
    except Exception as e:  # tool boundary: report failures to the agent as text
        return f"Error: {e}"
+# Web Search Tool
def web_search(query):
    """Run a Tavily question-answering search for ``query`` and return its answer."""
    return tavily_client.qna_search(query=query)
+# Image Generation Tool
def image_generation(query, num_inference_steps=4, guidance_scale=0.0):
    """Generate an image for ``query`` and save it to ``output.jpg``.

    The defaults match the UNet loaded into ``image_pipe``: the checkpoint is
    the SDXL-Lightning *4-step* distillation, which is meant to be sampled
    with 4 steps and classifier-free guidance disabled. The earlier values of
    20 steps and guidance 7.5 do not match that checkpoint.

    Args:
        query: Text prompt describing the image.
        num_inference_steps: Sampler steps; keep equal to the step count of
            the loaded Lightning checkpoint.
        guidance_scale: Classifier-free guidance scale; 0 disables it.

    Returns:
        The path of the saved image, always ``"output.jpg"``.
    """
    image = image_pipe(
        prompt=query,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).images[0]
    image.save("output.jpg")
    return "output.jpg"
+# Document Question Answering Tool
def doc_question_answering(query, file_path):
    """Answer ``query`` from the text file at ``file_path`` via retrieval QA.

    The file is read as UTF-8, split into chunks, embedded into a Chroma
    collection under ``.chroma_db``, and queried with a "stuff" RetrievalQA
    chain.

    Args:
        query: The question to answer.
        file_path: Path of the document to read.

    Returns:
        The value returned by the chain's ``run`` call.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    import uuid  # local import: used only to name the per-call collection

    # Read explicitly as UTF-8 rather than the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        file_content = f.read()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.create_documents([file_content])

    embeddings = OpenAIEmbeddings()
    # Use a fresh collection per call. Reusing the default collection in the
    # same persist directory appends to whatever earlier calls stored, so
    # chunks from previously uploaded documents would be retrieved too.
    db = Chroma.from_documents(
        docs,
        embeddings,
        collection_name=f"doc-{uuid.uuid4().hex}",
        persist_directory=".chroma_db",
    )

    qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=db.as_retriever())
    return qa.run(query)
+# Function to handle different input types and choose the right tool
 def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, websearch=False):
     # Voice input handling
     if audio:
         )
         user_prompt = transcription.text
+    # Initialize tools
+    tools = [
+        Tool(
+            name="Numpy Code Calculator",
+            func=numpy_code_calculator,
+            description="Useful for when you need to perform mathematical calculations using NumPy. Provide the calculation you want to perform.",
+        ),
+        Tool(
+            name="Web Search",
+            func=web_search,
+            description="Useful for when you need to find information from the real world.",
+        ),
+        Tool(
+            name="Image Generation",
+            func=image_generation,
+            description="Useful for when you need to generate an image based on a description.",
+        ),
+    ]
+    # Add document Q&A tool if a document is provided
+    if doc:
+        tools.append(
+            Tool(
+                name="Document Question Answering",
+                func=lambda query: doc_question_answering(query, doc.name),
+                description="Useful for when you need to answer questions about the uploaded document.",
+            )
+        )
+    # Initialize agent
+    agent = initialize_agent(
+        tools,
+        client,
+        agent="zero-shot-react-description",
+        verbose=True,
+    )
     # If user uploaded an image and text, use MiniCPM model
     if image:
         image = Image.open(image).convert('RGB')
         messages = [{"role": "user", "content": [image, user_prompt]}]
+        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
         return response
+    # Use the agent to determine the best tool and get the response
+    if websearch:
+        response = agent.run(f"{user_prompt} Use the Web Search tool if necessary.")
     else:
+        response = agent.run(user_prompt)
     return response
 # Gradio UI Setup
 def create_ui():
     with gr.Blocks() as demo:
                 user_prompt = gr.Textbox(placeholder="Type your message here...", lines=1)
             with gr.Column(scale=1):
                 image_input = gr.Image(type="filepath", label="Upload an image", elem_id="image-icon")
                 audio_input = gr.Audio(type="filepath", label="Upload audio", elem_id="mic-icon")
                 doc_input = gr.File(type="filepath", label="Upload a document", elem_id="document-icon")
                 voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode", elem_id="voice-only-mode")
                 websearch_mode = gr.Checkbox(label="Enable Web Search", elem_id="websearch-mode")
             with gr.Column(scale=1):
                 submit = gr.Button("Submit")
         output_label = gr.Label(label="Output")
         audio_output = gr.Audio(label="Audio Output", visible=False)
         submit.click(
             fn=main_interface,
+            inputs=[user_prompt, image_input, audio_input, doc_input, voice_only_mode, websearch_mode],
+            outputs=[output_label, audio_output]
         )
         # Voice-only mode UI
         voice_only_mode.change(
             lambda x: gr.update(visible=not x),
             inputs=voice_only_mode,
+            outputs=[user_prompt, image_input, doc_input, websearch_mode, submit]
         )
         voice_only_mode.change(
             lambda x: gr.update(visible=x),
     return demo
+# Main interface function
@spaces.GPU()
def main_interface(user_prompt, image=None, audio=None, doc=None, voice_only=False, websearch=False):
    """Gradio entry point: move the models to CUDA, then answer the request.

    Returns a ``(response, audio_file)`` pair. ``audio_file`` is the path
    returned by ``play_voice_output`` when ``voice_only`` is truthy, and
    ``None`` otherwise.
    """
    # Put every model on the GPU before handling the request.
    vqa_model.to(device='cuda', dtype=torch.bfloat16)
    tts_model.to("cuda")
    unet.to("cuda", torch.float16)
    image_pipe.to("cuda")

    reply = handle_input(user_prompt, image=image, audio=audio, doc=doc, websearch=websearch)

    # Text-only mode has no audio component.
    if not voice_only:
        return reply, None
    return reply, play_voice_output(reply)
 # Launch the app
 demo = create_ui()
+demo.launch(inline=False)