VanguardAI committed
Commit e7e0762 · verified · 1 Parent(s): dff714c

Update app.py

Files changed (1)
app.py +105 -137
app.py CHANGED
@@ -8,21 +8,16 @@ from transformers import AutoModel, AutoTokenizer
 from diffusers import StableDiffusion3Pipeline
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
-from langchain.agents import AgentExecutor, create_react_agent, initialize_agent, Tool
-from langchain.agents import AgentType
 from langchain_groq import ChatGroq
-from langchain.prompts import PromptTemplate
 from PIL import Image
 from tavily import TavilyClient
-import requests
-from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file
 from langchain.schema import AIMessage
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import TextLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.chains import RetrievalQA
+import json
 
 # Initialize models and clients
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
@@ -53,54 +48,46 @@ def play_voice_output(response):
     sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
     return "output.wav"
 
-# NumPy Code Calculator Tool
-class NumpyCodeCalculator(Tool):
-    name = "Calculator"
-    description = "Useful only for performing numerical computations, not for general searches"
-
-    def _run(self, query: str) -> str:
-        print("Executing NumpyCodeCalculator tool")
-        try:
-            local_dict = {"np": np}
-            exec(query, local_dict)
-            result = local_dict.get("result", "No result found")
-            return str(result)
-        except Exception as e:
-            return f"Error: {e}"
-
-# Web Search Tool
-class WebSearch(Tool):
-    name = "Web"
-    description = "Useful for advanced web searching beyond general information"
-
-    def _run(self, query: str) -> str:
-        print("Executing WebSearch tool")
-        answer = tavily_client.qna_search(query=query)
-        return answer
-
-# Image Generation Tool
-class ImageGeneration(Tool):
-    name = "Image"
-    description = "Useful for generating images based on text descriptions"
-
-    def _run(self, query: str) -> str:
-        print("Executing ImageGeneration tool")
-        image = pipe(
-            query,
-            negative_prompt="",
-            num_inference_steps=15,
-            guidance_scale=7.0,
-        ).images[0]
-        image.save("output.jpg")
-        return "output.jpg"
-
-# Document Question Answering Tool
-class DocumentQuestionAnswering(Tool):
-    name = "Document"
-    description = "Useful for answering questions about a specific document"
-
+# Function to classify user input using LLM
+def classify_function(user_prompt):
+    prompt = f"""
+    You are a function classifier AI assistant. You are given a user input and you need to classify it into one of the following functions:
+
+    - `image_generation`: If the user wants to generate an image.
+    - `image_description`: If the user wants to describe an image.
+    - `document_summarization`: If the user wants to summarize a document.
+    - `text_to_text`: If the user wants a text-based response.
+
+    Respond with a JSON object containing only the chosen function. For example:
+
+    ```json
+    {{"function": "image_generation"}}
+    ```
+
+    User input: {user_prompt}
+    """
+
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        model="llama3-8b-8192",
+    )
+
+    try:
+        response = json.loads(chat_completion.choices[0].message.content)
+        function = response.get("function")
+        return function
+    except json.JSONDecodeError:
+        print(f"Error decoding JSON: {chat_completion.choices[0].message.content}")
+        return "text_to_text"  # Default to text-to-text if JSON parsing fails
+
+# Document Question Answering Tool
+class DocumentQuestionAnswering:
     def __init__(self, document):
-        super().__init__()
         self.document = document
         self.qa_chain = self._setup_qa_chain()
 
@@ -120,79 +107,94 @@ class DocumentQuestionAnswering(Tool):
         )
         return qa_chain
 
-    def _run(self, query: str) -> str:
+    def run(self, query: str) -> str:
         print("Executing DocumentQuestionAnswering tool")
         response = self.qa_chain.run(query)
         return str(response)
 
-
-# Function to handle different input types and choose the right tool
+# Function to handle different input types and choose the right pipeline
 def handle_input(user_prompt, image=None, audio=None, websearch=False, document=None):
     print(f"Handling input: {user_prompt}")
 
     # Initialize the LLM
    llm = ChatGroq(model=MODEL, api_key=os.environ.get("GROQ_API_KEY"))
 
-    # Define the tools
-    tools = []
-
-    # Add Image Generation Tool
-    tools.append(ImageGeneration())
-
-    # Add Calculator Tool
-    tools.append(NumpyCodeCalculator())
-
-    # Add Web Search Tool if enabled
-    if websearch:
-        tools.append(WebSearch())
-
-    # Add Document QA Tool if document is provided
-    if document:
-        tools.append(DocumentQuestionAnswering(document))
-
-    # Check if any tools are mentioned in the user prompt
-    requires_tool = any([tool.name.lower() in user_prompt.lower() for tool in tools])
-
-    # Handle different input scenarios
-    if image:
-        print("Processing image input")
-        image = Image.open(image).convert('RGB')
-        messages = [{"role": "user", "content": [image, user_prompt]}]
-        response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
-    elif audio:
+    # Handle voice-only mode
+    if audio:
         print("Processing audio input")
         transcription = client.audio.transcriptions.create(
             file=(audio.name, audio.read()),
             model="whisper-large-v3"
         )
         user_prompt = transcription.text
-        # If tools are required, use an agent
-        if requires_tool:
-            agent = initialize_agent(
-                tools,
-                llm,
-                agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-                verbose=True
-            )
-            response = agent.run(user_prompt)
+        response = llm.call(query=user_prompt)
+        audio_output = play_voice_output(response)
+        return "Response generated.", audio_output
+
+    # Handle websearch mode
+    if websearch:
+        print("Executing Web Search")
+        answer = tavily_client.qna_search(query=user_prompt)
+        return answer, None
+
+    # Classify user input using LLM
+    function = classify_function(user_prompt)
+
+    # Handle different functions
+    if function == "image_generation":
+        print("Executing Image Generation")
+        image = pipe(
+            user_prompt,
+            negative_prompt="",
+            num_inference_steps=15,
+            guidance_scale=7.0,
+        ).images[0]
+        image.save("output.jpg")
+        return "output.jpg", None
+
+    elif function == "image_description":
+        print("Executing Image Description")
+        if image:
+            image = Image.open(image).convert('RGB')
+            messages = [{"role": "user", "content": [image, user_prompt]}]
+            response = vqa_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+            return response, None
         else:
-            response = llm.call(query=user_prompt)
-    elif requires_tool:
-        print("Using agent with tools")
-        agent = initialize_agent(
-            tools,
-            llm,
-            agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-            verbose=True
-        )
-        response = agent.run(user_prompt)
-    else:
-        print("Using LLM directly")
+            return "Please upload an image.", None
+
+    elif function == "document_summarization":
+        print("Executing Document Summarization")
+        if document:
+            document_qa = DocumentQuestionAnswering(document)
+            response = document_qa.run(user_prompt)
+            return response, None
+        else:
+            return "Please upload a document.", None
+
+    else:  # function == "text_to_text"
+        print("Executing Text-to-Text")
         response = llm.call(query=user_prompt)
+        return response, None
+
+# Main interface function
+@spaces.GPU(duration=720)
+def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
+    print("Starting main_interface function")
+    vqa_model.to(device='cuda', dtype=torch.bfloat16)
+    tts_model.to("cuda")
+    pipe.to("cuda")
+
+    print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
+
+    try:
+        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch, document=document)
+        print("handle_input function executed successfully")
+    except Exception as e:
+        print(f"Error in handle_input: {e}")
+        response = "Error occurred during processing."
 
     return response
 
-
 def create_ui():
     with gr.Blocks(css="""
         /* Overall Styling */
@@ -403,40 +405,6 @@ def create_ui():
 
     return demo
 
-# Main interface function
-@spaces.GPU(duration=720)
-def main_interface(user_prompt, image=None, audio=None, voice_only=False, websearch=False, document=None):
-    print("Starting main_interface function")
-    vqa_model.to(device='cuda', dtype=torch.bfloat16)
-    tts_model.to("cuda")
-    pipe.to("cuda")
-
-    print(f"user_prompt: {user_prompt}, image: {image}, audio: {audio}, voice_only: {voice_only}, websearch: {websearch}, document: {document}")
-
-    try:
-        response = handle_input(user_prompt, image=image, audio=audio, websearch=websearch, document=document)
-        print("handle_input function executed successfully")
-    except Exception as e:
-        print(f"Error in handle_input: {e}")
-        response = "Error occurred during processing."
-
-    if voice_only:
-        try:
-            transcription = client.audio.transcriptions.create(
-                file=("input.wav", open("input.wav", "rb").read()),
-                model="whisper-large-v3"
-            )
-            user_prompt = transcription.text
-            response = handle_input(user_prompt)
-            audio_output = play_voice_output(response)
-            print("play_voice_output function executed successfully")
-            return "Response generated.", audio_output
-        except Exception as e:
-            print(f"Error in play_voice_output: {e}")
-            return "Error occurred during voice output.", None
-    else:
-        return response, None
-
 # Launch the UI
 demo = create_ui()
 demo.launch()
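
Note on the new `classify_function`: its prompt instructs the model to wrap the answer in a fenced JSON code block, yet the code passes the raw completion straight to `json.loads`, so a reply that follows the prompt's own example raises `json.JSONDecodeError` and silently falls back to `text_to_text`. A fence-tolerant parse avoids that; the sketch below assumes the reply carries a single JSON object, and `parse_function_label` is a hypothetical helper, not part of this commit:

````python
import json
import re

def parse_function_label(raw: str, default: str = "text_to_text") -> str:
    """Extract {"function": ...} from an LLM reply that may wrap the JSON
    in a ```json ... ``` fence (sketch only, not part of this commit)."""
    # Prefer the contents of a fenced block; otherwise try the raw reply.
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    candidate = match.group(1) if match else raw
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return default
    return parsed.get("function", default) if isinstance(parsed, dict) else default
````

Since the Groq endpoint is OpenAI-compatible, passing `response_format={"type": "json_object"}` to `client.chat.completions.create` (where the chosen model supports JSON mode) would be another way to guarantee bare JSON.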