Spaces:

VanguardAI
/

MultiModal_OpenSource_AI

Runtime error

App Files Files Community

VanguardAI committed on Aug 14, 2024

Commit

df220f6

verified ·

1 Parent(s): d70fa2a

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -11

app.py CHANGED Viewed

@@ -17,6 +17,7 @@ import requests
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from llama_index.core.chat_engine.types import AgentChatResponse
 # Initialize models and clients
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
@@ -73,8 +74,15 @@ def image_generation(query):
     image.save("output.jpg")
     return "output.jpg"
 # Function to handle different input types and choose the right tool
-def handle_input(user_prompt, image=None, audio=None, websearch=False):
     if audio:
         if isinstance(audio, str):
             audio = open(audio, "rb")
@@ -88,11 +96,16 @@ def handle_input(user_prompt, image=None, audio=None, websearch=False):
         FunctionTool.from_defaults(fn=numpy_code_calculator, name="Numpy"),
         FunctionTool.from_defaults(fn=image_generation, name="Image"),
     ]
     # Add the web search tool only if websearch mode is enabled
     if websearch:
         tools.append(FunctionTool.from_defaults(fn=web_search, name="Web"))
     llm = Groq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
     agent = ReActAgent.from_tools(tools, llm=llm, verbose=True)
@@ -102,11 +115,11 @@ def handle_input(user_prompt, image=None, audio=None, websearch=False):
         response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
     else:
         response = agent.chat(user_prompt)
     # Extract the content from AgentChatResponse to return as a string
     if isinstance(response, AgentChatResponse):
         response = response.response
     return response
@@ -120,6 +133,7 @@ def create_ui():
             with gr.Column(scale=1):
                 image_input = gr.Image(type="filepath", label="Upload an image", elem_id="image-icon")
                 audio_input = gr.Audio(type="filepath", label="Upload audio", elem_id="mic-icon")
                 voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode", elem_id="voice-only-mode")
                 websearch_mode = gr.Checkbox(label="Enable Web Search", elem_id="websearch-mode")
             with gr.Column(scale=1):
@@ -130,14 +144,14 @@ def create_ui():
         submit.click(
             fn=main_interface,
-            inputs=[user_prompt, image_input, audio_input, voice_only_mode, websearch_mode],
             outputs=[output_label, audio_output]
         )
         voice_only_mode.change(
             lambda x: gr.update(visible=not x),
             inputs=voice_only_mode,
-            outputs=[user_prompt, image_input, websearch_mode, submit]
         )
         voice_only_mode.change(
             lambda x: gr.update(visible=x),
@@ -149,16 +163,16 @@ def create_ui():
 # Main interface function
 @spaces.GPU()
-def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False):
     print("Starting main_interface function")
     vqa_model.to(device='cuda', dtype=torch.bfloat16)
     tts_model.to("cuda")
     pipe.to("cuda")
-    print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}")
     try:
-        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch)
         print("handle_input function executed successfully")
     except Exception as e:
         print(f"Error in handle_input: {e}")
@@ -178,4 +192,4 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websea
 # Launch the UI
 demo = create_ui()
-demo.launch()

 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from llama_index.core.chat_engine.types import AgentChatResponse
+from llama_index.core import VectorStoreIndex
 # Initialize models and clients
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
     image.save("output.jpg")
     return "output.jpg"
+# Document Question Answering Tool
+def document_question_answering(query, docs):
+    index = VectorStoreIndex.from_documents(docs)
+    query_engine = index.as_query_engine(similarity_top_k=3)
+    response = query_engine.query(query)
+    return str(response)
 # Function to handle different input types and choose the right tool
+def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
     if audio:
         if isinstance(audio, str):
             audio = open(audio, "rb")
         FunctionTool.from_defaults(fn=numpy_code_calculator, name="Numpy"),
         FunctionTool.from_defaults(fn=image_generation, name="Image"),
     ]
     # Add the web search tool only if websearch mode is enabled
     if websearch:
         tools.append(FunctionTool.from_defaults(fn=web_search, name="Web"))
+    # Add the document question answering tool only if a document is provided
+    if document:
+        docs = LlamaParse(result_type="text").load_data(document)
+        tools.append(FunctionTool.from_defaults(fn=document_question_answering, name="Document", docs=docs))
     llm = Groq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
     agent = ReActAgent.from_tools(tools, llm=llm, verbose=True)
         response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
     else:
         response = agent.chat(user_prompt)
     # Extract the content from AgentChatResponse to return as a string
     if isinstance(response, AgentChatResponse):
         response = response.response
     return response
             with gr.Column(scale=1):
                 image_input = gr.Image(type="filepath", label="Upload an image", elem_id="image-icon")
                 audio_input = gr.Audio(type="filepath", label="Upload audio", elem_id="mic-icon")
+                document_input = gr.File(type="file", label="Upload a document", elem_id="document-icon")
                 voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode", elem_id="voice-only-mode")
                 websearch_mode = gr.Checkbox(label="Enable Web Search", elem_id="websearch-mode")
             with gr.Column(scale=1):
         submit.click(
             fn=main_interface,
+            inputs=[user_prompt, image_input, audio_input, voice_only_mode, websearch_mode, document_input],
             outputs=[output_label, audio_output]
         )
         voice_only_mode.change(
             lambda x: gr.update(visible=not x),
             inputs=voice_only_mode,
+            outputs=[user_prompt, image_input, websearch_mode, document_input, submit]
         )
         voice_only_mode.change(
             lambda x: gr.update(visible=x),
 # Main interface function
 @spaces.GPU()
+def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
     print("Starting main_interface function")
     vqa_model.to(device='cuda', dtype=torch.bfloat16)
     tts_model.to("cuda")
     pipe.to("cuda")
+    print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
     try:
+        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch, document=document)
         print("handle_input function executed successfully")
     except Exception as e:
         print(f"Error in handle_input: {e}")
 # Launch the UI
 demo = create_ui()
+demo.launch()