VanguardAI committed on
Commit
c1e9d2a
·
verified ·
1 Parent(s): a62642f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -18
app.py CHANGED
@@ -3,7 +3,7 @@ import torch
3
  import os
4
  import numpy as np
5
  from groq import Groq
6
- import spaces
7
  from transformers import AutoModel, AutoTokenizer
8
  from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
9
  from parler_tts import ParlerTTSForConditionalGeneration
@@ -11,9 +11,9 @@ import soundfile as sf
11
  from langchain_community.embeddings import OpenAIEmbeddings
12
  from langchain_community.vectorstores import Chroma
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
- from langchain.chains import RetrievalQA
15
- from langchain.agents import initialize_agent, Tool
16
- from langchain.llms import OpenAI
17
  from PIL import Image
18
  from decord import VideoReader, cpu
19
  from tavily import TavilyClient
@@ -24,6 +24,7 @@ from safetensors.torch import load_file
24
  # Initialize models and clients
25
  client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
26
  MODEL = 'llama3-groq-70b-8192-tool-use-preview'
 
27
 
28
  vqa_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
29
  device_map="auto", torch_dtype=torch.bfloat16)
@@ -43,7 +44,7 @@ image_pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dt
43
  image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
44
 
45
  # Tavily Client for web search
46
- tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API"))
47
 
48
  # Function to play voice output
49
  def play_voice_output(response):
@@ -59,8 +60,6 @@ def play_voice_output(response):
59
  def numpy_code_calculator(query):
60
  """Generates and executes NumPy code for mathematical operations."""
61
  try:
62
- # You might need to use a more sophisticated approach to generate NumPy code
63
- # based on the user's query. This is a simple example.
64
  llm_response = client.chat.completions.create(
65
  model=MODEL,
66
  messages=[
@@ -68,7 +67,7 @@ def numpy_code_calculator(query):
68
  ]
69
  )
70
  code = llm_response.choices[0].message.content
71
- print(f"Generated NumPy code:\n{code}") # Print the generated code
72
 
73
  # Execute the code in a safe environment
74
  local_dict = {"np": np}
@@ -104,9 +103,12 @@ def doc_question_answering(query, file_path):
104
  return qa.run(query)
105
 
106
  # Function to handle different input types and choose the right tool
107
- def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, websearch=False):
108
  # Voice input handling
109
  if audio:
 
 
 
110
  transcription = client.audio.transcriptions.create(
111
  file=(audio.name, audio.read()),
112
  model="whisper-large-v3"
@@ -143,12 +145,8 @@ def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, webs
143
  )
144
 
145
  # Initialize agent
146
- agent = initialize_agent(
147
- tools,
148
- client,
149
- agent="zero-shot-react-description",
150
- verbose=True,
151
- )
152
 
153
  # If user uploaded an image and text, use MiniCPM model
154
  if image:
@@ -159,9 +157,9 @@ def handle_input(user_prompt, image=None, video=None, audio=None, doc=None, webs
159
 
160
  # Use the agent to determine the best tool and get the response
161
  if websearch:
162
- response = agent.run(f"{user_prompt} Use the Web Search tool if necessary.")
163
  else:
164
- response = agent.run(user_prompt)
165
 
166
  return response
167
 
@@ -209,7 +207,7 @@ def create_ui():
209
  def main_interface(user_prompt, image=None, audio=None, doc=None, voice_only=False, websearch=False):
210
  vqa_model.to(device='cuda', dtype=torch.bfloat16)
211
  tts_model.to("cuda")
212
- unet.to("cuda", torch.float16)
213
  image_pipe.to("cuda")
214
 
215
  response = handle_input(user_prompt, image=image, audio=audio, doc=doc, websearch=websearch)
 
3
  import os
4
  import numpy as np
5
  from groq import Groq
6
+ import spaces # Import spaces
7
  from transformers import AutoModel, AutoTokenizer
8
  from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
9
  from parler_tts import ParlerTTSForConditionalGeneration
 
11
  from langchain_community.embeddings import OpenAIEmbeddings
12
  from langchain_community.vectorstores import Chroma
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
+ from langchain.chains import RetrievalQA, LLMChain
15
+ from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
16
+ from langchain.llms import Groq as GroqLlm # Import GroqLlm
17
  from PIL import Image
18
  from decord import VideoReader, cpu
19
  from tavily import TavilyClient
 
24
  # Initialize models and clients
25
  client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
26
  MODEL = 'llama3-groq-70b-8192-tool-use-preview'
27
+ llm = GroqLlm(client=client, model=MODEL) # Initialize GroqLlm
28
 
29
  vqa_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
30
  device_map="auto", torch_dtype=torch.bfloat16)
 
44
  image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
45
 
46
  # Tavily Client for web search
47
+ tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY")) # Corrected API key
48
 
49
  # Function to play voice output
50
  def play_voice_output(response):
 
60
  def numpy_code_calculator(query):
61
  """Generates and executes NumPy code for mathematical operations."""
62
  try:
 
 
63
  llm_response = client.chat.completions.create(
64
  model=MODEL,
65
  messages=[
 
67
  ]
68
  )
69
  code = llm_response.choices[0].message.content
70
+ print(f"Generated NumPy code:\n{code}")
71
 
72
  # Execute the code in a safe environment
73
  local_dict = {"np": np}
 
103
  return qa.run(query)
104
 
105
  # Function to handle different input types and choose the right tool
106
+ def handle_input(user_prompt, image=None, audio=None, doc=None, websearch=False):
107
  # Voice input handling
108
  if audio:
109
+ # Make sure 'audio' is a file object
110
+ if isinstance(audio, str):
111
+ audio = open(audio, "rb")
112
  transcription = client.audio.transcriptions.create(
113
  file=(audio.name, audio.read()),
114
  model="whisper-large-v3"
 
145
  )
146
 
147
  # Initialize agent
148
+ agent = ZeroShotAgent(llm_chain=LLMChain(llm=llm, prompt=None), tools=tools, verbose=True)
149
+ agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True)
 
 
 
 
150
 
151
  # If user uploaded an image and text, use MiniCPM model
152
  if image:
 
157
 
158
  # Use the agent to determine the best tool and get the response
159
  if websearch:
160
+ response = agent_executor.run(f"{user_prompt} Use the Web Search tool if necessary.")
161
  else:
162
+ response = agent_executor.run(user_prompt)
163
 
164
  return response
165
 
 
207
  def main_interface(user_prompt, image=None, audio=None, doc=None, voice_only=False, websearch=False):
208
  vqa_model.to(device='cuda', dtype=torch.bfloat16)
209
  tts_model.to("cuda")
210
+ unet.to("cuda")
211
  image_pipe.to("cuda")
212
 
213
  response = handle_input(user_prompt, image=image, audio=audio, doc=doc, websearch=websearch)