Spaces:

VanguardAI
/

MultiModal_OpenSource_AI

Paused

App Files Community

VanguardAI committed on Aug 15, 2024

Commit

c7c3138

verified ·

1 Parent(s): 8318c4a

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -63

app.py CHANGED Viewed

@@ -8,10 +8,7 @@ from transformers import AutoModel, AutoTokenizer
 from diffusers import StableDiffusion3Pipeline
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
-from langchain.agents import AgentExecutor, create_react_agent
-from langchain.tools import BaseTool
-from langchain_groq import ChatGroq
-from langchain.agents import AgentExecutor, initialize_agent, Tool
 from langchain.agents import AgentType
 from langchain_groq import ChatGroq
 from langchain.prompts import PromptTemplate
@@ -56,9 +53,9 @@ def play_voice_output(response):
     return "output.wav"
 # NumPy Code Calculator Tool
-class NumpyCodeCalculator(BaseTool):
     name = "Numpy"
-    description = "Useful for performing numpy computations"
     def _run(self, query: str) -> str:
         try:
@@ -70,16 +67,16 @@ class NumpyCodeCalculator(BaseTool):
             return f"Error: {e}"
 # Web Search Tool
-class WebSearch(BaseTool):
     name = "Web"
-    description = "Useful for searching the web for information"
     def _run(self, query: str) -> str:
         answer = tavily_client.qna_search(query=query)
         return answer
 # Image Generation Tool
-class ImageGeneration(BaseTool):
     name = "Image"
     description = "Useful for generating images based on text descriptions"
@@ -94,7 +91,7 @@ class ImageGeneration(BaseTool):
         return "output.jpg"
 # Document Question Answering Tool
-class DocumentQuestionAnswering(BaseTool):
     name = "Document"
     description = "Useful for answering questions about a specific document"
@@ -122,8 +119,8 @@ class DocumentQuestionAnswering(BaseTool):
         response = self.qa_chain.run(query)
         return str(response)
-class DuckDuckGoSearchRun(BaseTool):
-    name = "DuckDuckGo"
     description = "Useful for searching the internet for general information"
     def _run(self, query: str) -> str:
@@ -136,75 +133,52 @@ class DuckDuckGoSearchRun(BaseTool):
         data = response.json()
         answer = data["Abstract"]
         return answer
-# Function to handle different input types and choose the right tool
 # Function to handle different input types and choose the right tool
-def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
-    # Initialize the search tool
-    search = DuckDuckGoSearchRun()
     tools = [
-        Tool(
-            name="Search",
-            func=search.run,
-            description="Useful for searching the internet for general information"
-        ),
-        Tool(
-            name="Image",
-            func=ImageGeneration()._run,
-            description="Useful for generating images based on text descriptions"
-        ),
     ]
-    # Add the numpy tool, but with a more specific description
-    tools.append(Tool(
-        name="Numpy",
-        func=NumpyCodeCalculator()._run,
-        description="Useful only for performing numerical computations, not for general searches"
-    ))
     # Add the web search tool only if websearch mode is enabled
     if websearch:
-        tools.append(Tool(
-            name="Web",
-            func=WebSearch()._run,
-            description="Useful for advanced web searching beyond general information"
-        ))
     # Add the document question answering tool only if a document is provided
     if document:
-        tools.append(Tool(
-            name="Document",
-            func=DocumentQuestionAnswering(document)._run,
-            description="Useful for answering questions about a specific document"
-        ))
-    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
     # Check if the input requires any tools
-    requires_tool = False
-    for tool in tools:
-        if tool.name.lower() in user_prompt.lower():
-            requires_tool = True
-            break
-    if image or audio or requires_tool:
-        # Initialize the agent
         agent = initialize_agent(
             tools,
             llm,
             agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
             verbose=True
         )
-        if image:
-            image = Image.open(image).convert('RGB')
-            messages = [{"role": "user", "content": [image, user_prompt]}]
-            response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
-        else:
-            response = agent.run(user_prompt)
     else:
-        # If no tools are required, use the LLM directly
         response = llm.call(query=user_prompt)
     return response
@@ -420,7 +394,6 @@ def create_ui():
     return demo
-# Main interface function
 @spaces.GPU(duration=180)
 def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
     print("Starting main_interface function")
@@ -431,7 +404,7 @@ def main_interface(user_prompt, image=None, audio=None, voice_only=False, websea
     print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
     try:
-        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch, document=document)
         print("handle_input function executed successfully")
     except Exception as e:
         print(f"Error in handle_input: {e}")

 from diffusers import StableDiffusion3Pipeline
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
+from langchain.agents import AgentExecutor, create_react_agent, initialize_agent, Tool
 from langchain.agents import AgentType
 from langchain_groq import ChatGroq
 from langchain.prompts import PromptTemplate
     return "output.wav"
 # NumPy Code Calculator Tool
+class NumpyCodeCalculator(Tool):
     name = "Numpy"
+    description = "Useful only for performing numerical computations, not for general searches"
     def _run(self, query: str) -> str:
         try:
             return f"Error: {e}"
 # Web Search Tool
+class WebSearch(Tool):
     name = "Web"
+    description = "Useful for advanced web searching beyond general information"
     def _run(self, query: str) -> str:
         answer = tavily_client.qna_search(query=query)
         return answer
 # Image Generation Tool
+class ImageGeneration(Tool):
     name = "Image"
     description = "Useful for generating images based on text descriptions"
         return "output.jpg"
 # Document Question Answering Tool
+class DocumentQuestionAnswering(Tool):
     name = "Document"
     description = "Useful for answering questions about a specific document"
         response = self.qa_chain.run(query)
         return str(response)
+class DuckDuckGoSearchRun(Tool):
+    name = "Search"
     description = "Useful for searching the internet for general information"
     def _run(self, query: str) -> str:
         data = response.json()
         answer = data["Abstract"]
         return answer
 # Function to handle different input types and choose the right tool
+def handle_input(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
+    # Initialize the LLM
+    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
+    # Initialize tools
     tools = [
+        DuckDuckGoSearchRun(),
+        ImageGeneration(),
+        NumpyCodeCalculator(),
     ]
     # Add the web search tool only if websearch mode is enabled
     if websearch:
+        tools.append(WebSearch())
     # Add the document question answering tool only if a document is provided
     if document:
+        tools.append(DocumentQuestionAnswering(document))
+    # Handle voice input
+    if voice_only and audio:
+        # TODO: Implement Whisper integration for voice-to-text
+        user_prompt = "Whisper transcription of audio" # Replace with actual transcription
+    # Handle image and text input
+    if image and user_prompt:
+        image = Image.open(image).convert('RGB')
+        messages = [{"role": "user", "content": [image, user_prompt]}]
+        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+        return response
     # Check if the input requires any tools
+    requires_tool = any(tool.name.lower() in user_prompt.lower() for tool in tools)
+    # Use agent if tools are required, otherwise use LLM directly
+    if requires_tool:
         agent = initialize_agent(
             tools,
             llm,
             agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
             verbose=True
         )
+        response = agent.run(user_prompt)
     else:
         response = llm.call(query=user_prompt)
     return response
     return demo
 @spaces.GPU(duration=180)
 def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
     print("Starting main_interface function")
     print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
     try:
+        response = handle_input(user_prompt, image=image, audio=audio, voice_only=voice_only, websearch=websearch, document=document)
         print("handle_input function executed successfully")
     except Exception as e:
         print(f"Error in handle_input: {e}")