laverdes committed
Commit fae2aeb · 1 Parent(s): ba86979

chore: app update, authorized imports

Files changed (5):
  1. .gitignore +3 -0
  2. Gradio_UI.py +108 -1
  3. agent.json +1 -2
  4. app.py +19 -2
  5. requirements.txt +16 -3
.gitignore CHANGED
@@ -0,0 +1,3 @@
+ .env
+ tools/unassigned.py
+ /todos_and_alternatives.py
Gradio_UI.py CHANGED
@@ -13,16 +13,22 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import asyncio
  import mimetypes
  import os
  import re
  import shutil
  from typing import Optional

+ import torch
  from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types
- from smolagents.agents import ActionStep, MultiStepAgent
+ from smolagents.agents import ActionStep, MultiStepAgent, PlanningStep, TaskStep
  from smolagents.memory import MemoryStep
  from smolagents.utils import _is_package_available
+ from transformers import pipeline, AutoTokenizer
+
+ import numpy as np
+ import sounddevice as sd


  def pull_messages_from_step(
@@ -123,6 +129,93 @@ def pull_messages_from_step(
      yield gr.ChatMessage(role="assistant", content="-----")


+ text_to_speech_pipe = pipeline(
+     task="text-to-speech",
+     model="facebook/fastspeech2-en-ljspeech",  # "suno/bark-small",
+     device=0 if torch.cuda.is_available() else "cpu",
+     torch_dtype=torch.float16,
+ )
+ tokenizer = AutoTokenizer.from_pretrained("suno/bark-small")
+ # print("suno/bark-small tokenizer pad_token_id: ", tokenizer.pad_token_id)  # 0
+ # print("suno/bark-small tokenizer eos_token_id: ", tokenizer.eos_token_id)  # None
+ text_to_speech_pipe.model.pad_token_id = tokenizer.pad_token_id
+ text_to_speech_pipe.model.eos_token_id = tokenizer.eos_token_id
+
+
+ async def play_audio_async(audio_data, sampling_rate):
+     loop = asyncio.get_event_loop()
+     await loop.run_in_executor(None, sd.play, audio_data, sampling_rate)
+
+
+ async def speech_to_text(text_in):
+     text = f"[clears throat] {text_in}"
+     # attention_mask = [1] * len(text.split())
+     output = text_to_speech_pipe(text)
+     # display(Audio(output["audio"], rate=output["sampling_rate"]))  # notebook
+     audio = np.array(output["audio"])  # , dtype=np.float16)
+     print("Original audio shape:", audio.shape)
+
+     # Adjust audio shape if necessary:
+     if audio.ndim == 1:
+         # Mono audio, should be fine. You can check if your device expects stereo.
+         print("Mono audio... should be fine. You can check if your device expects stereo.")
+     elif audio.ndim == 2:
+         # Check if the number of channels is acceptable (e.g., 1 or 2)
+         channels = audio.shape[1]
+         if channels not in [1, 2]:
+             # Try to squeeze extra dimensions
+             audio = np.squeeze(audio)
+             print("Squeezed audio shape:", audio.shape)
+     else:
+         # If audio has more dimensions than expected, flatten or reshape as needed
+         audio = np.squeeze(audio)
+         print("Squeezed audio shape:", audio.shape)
+
+     # Play the audio using sounddevice
+     try:
+         # sd.play(audio, output["sampling_rate"])
+         # sd.wait()  # Wait until audio playback is complete
+         await play_audio_async(audio, output["sampling_rate"])
+     except Exception as e:
+         print(f"Error playing audio: {e}")
+
+     return True
+
+
+ def sync_speech_to_text(text_in):
+     text = f"[clears throat] {text_in}"
+     # attention_mask = [1] * len(text.split())
+     output = text_to_speech_pipe(text)
+     # display(Audio(output["audio"], rate=output["sampling_rate"]))  # notebook
+     audio = np.array(output["audio"], dtype=np.float32)
+     print("Original audio shape:", audio.shape)
+
+     # Adjust audio shape if necessary:
+     if audio.ndim == 1:
+         # Mono audio, should be fine. You can check if your device expects stereo.
+         print("Mono audio... should be fine. You can check if your device expects stereo.")
+     elif audio.ndim == 2:
+         # Check if the number of channels is acceptable (e.g., 1 or 2)
+         channels = audio.shape[1]
+         if channels not in [1, 2]:
+             # Try to squeeze extra dimensions
+             audio = np.squeeze(audio)
+             print("Squeezed audio shape:", audio.shape)
+     else:
+         # If audio has more dimensions than expected, flatten or reshape as needed
+         audio = np.squeeze(audio)
+         print("Squeezed audio shape:", audio.shape)
+
+     # Play the audio using sounddevice
+     try:
+         sd.play(audio, output["sampling_rate"])
+         sd.wait()  # Wait until audio playback is complete
+     except Exception as e:
+         print(f"Error playing audio: {e}")
+
+     return True
+
+
  def stream_to_gradio(
      agent,
      task: str,
@@ -151,6 +244,20 @@ def stream_to_gradio(
      for message in pull_messages_from_step(
          step_log,
      ):
+         """
+         if isinstance(step_log, ActionStep):
+             speech = message.content + ". This was an Action Step."
+         if isinstance(step_log, PlanningStep):
+             speech = message.content + ". We are done with the Planning Step."
+         if isinstance(step_log, TaskStep):
+             speech = message.content + ". This is a Task Step."
+         if isinstance(step_log, MemoryStep):
+             speech = "Memory Step: " + message.content
+         """
+         # if "Thought:" in message.content and isinstance(step_log, MemoryStep):
+         #     asyncio.run(speech_to_text(f"[clears throat] I am thinking that {message.content.replace('Thought:', '')}"))
+         #     sync_speech_to_text(f"[clears throat] I am thinking that {message.content.replace('Thought:', '')}")
+         # pass
          yield message

      final_answer = step_log  # Last log is the run's final_answer
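
The `play_audio_async` helper added above pushes sounddevice's playback call into an executor thread so the Gradio event loop stays responsive. A minimal self-contained sketch of that pattern, assuming `sounddevice` is installed and an output device is available (the sine-wave input is only a stand-in for the pipeline's audio):

```python
import asyncio

import numpy as np
import sounddevice as sd


def play_blocking(audio: np.ndarray, sampling_rate: int) -> None:
    # sd.play() starts playback and returns immediately;
    # sd.wait() blocks the calling thread until playback finishes.
    sd.play(audio, sampling_rate)
    sd.wait()


async def play_audio_async(audio: np.ndarray, sampling_rate: int) -> None:
    # Offload the blocking play-and-wait pair to a worker thread so the
    # asyncio event loop keeps serving other callbacks in the meantime.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, play_blocking, audio, sampling_rate)


async def main() -> None:
    rate = 22050
    t = np.linspace(0, 1.0, rate, endpoint=False)
    tone = 0.2 * np.sin(2 * np.pi * 440 * t).astype(np.float32)  # 1 s of 440 Hz
    await play_audio_async(tone, rate)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that the committed helper submits `sd.play` itself to the executor; since `sd.play` returns as soon as playback starts, awaiting it does not wait for the audio to finish, which is why this sketch pairs it with `sd.wait` inside the worker thread.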
agent.json CHANGED
@@ -47,7 +47,6 @@
    "statistics",
    "queue",
    "time",
-   "collections",
-   "re"
+   "collections"
  ]
}
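
`authorized_imports` in `agent.json` is the whitelist the smolagents Python interpreter consults before executing an agent's `import` statements; dropping `"re"` here pairs with the `BASE_BUILTIN_MODULES.remove("re")` call in the app.py diff below. A rough sketch of how such a whitelist is typically supplied when the agent is built in code — the tool list and model are placeholders, not this app's actual configuration, and `HfApiModel` assumes a smolagents version from around this commit:

```python
from smolagents import CodeAgent, HfApiModel
from smolagents.utils import BASE_BUILTIN_MODULES

# BASE_BUILTIN_MODULES is a plain module-level list, so removing an entry
# shrinks the default import whitelist for every agent built afterwards.
if "re" in BASE_BUILTIN_MODULES:
    BASE_BUILTIN_MODULES.remove("re")

agent = CodeAgent(
    tools=[],            # placeholder: the real app registers several @tool functions
    model=HfApiModel(),  # placeholder: the app loads its model via smolagents.cli
    # Modules beyond the built-in whitelist that generated code may import:
    additional_authorized_imports=["numpy", "requests", "helium", "bs4"],
)
```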
app.py CHANGED
@@ -24,7 +24,6 @@ from datetime import datetime
24
  from skimage import io
25
  from PIL import Image
26
  from typing import Optional, Tuple
27
- from IPython.display import Audio, display
28
 
29
  from opentelemetry.sdk.trace import TracerProvider
30
  from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -40,6 +39,7 @@ from langchain_openai import OpenAI
40
  from io import BytesIO
41
  from time import sleep
42
 
 
43
  from smolagents.agents import ActionStep
44
  from smolagents.cli import load_model
45
  from smolagents import (
@@ -59,6 +59,8 @@ from smolagents import (
59
  # load .env vars
60
  load_dotenv()
61
 
 
 
62
  # fast prototyping tools
63
  @tool
64
  def get_current_time_in_timezone(timezone: str) -> str:
@@ -203,10 +205,24 @@ text_to_speech_pipe = pipeline(
203
  )
204
  text_to_speech_pipe.model.enable_cpu_offload()
205
  text_to_speech_pipe.model.use_flash_attention_2=True
 
 
 
 
 
 
 
 
 
 
206
 
207
  def speech_to_text(final_answer_text, agent_memory):
208
- text = f"[clears throat] {final_answer_text}"
 
 
 
209
  output = text_to_speech_pipe(text)
 
210
  # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook
211
  audio = np.array(output["audio"], dtype=np.float32)
212
  print("Original audio shape:", audio.shape)
@@ -333,6 +349,7 @@ agent = CodeAgent(
333
  "numpy",
334
  "requests",
335
  "helium",
 
336
  ],
337
  # I could also add the authorized_imports from a LIST_SAFE_MODULES
338
  )
 
24
  from skimage import io
25
  from PIL import Image
26
  from typing import Optional, Tuple
 
27
 
28
  from opentelemetry.sdk.trace import TracerProvider
29
  from openinference.instrumentation.smolagents import SmolagentsInstrumentor
 
39
  from io import BytesIO
40
  from time import sleep
41
 
42
+ from smolagents.utils import BASE_BUILTIN_MODULES
43
  from smolagents.agents import ActionStep
44
  from smolagents.cli import load_model
45
  from smolagents import (
 
59
  # load .env vars
60
  load_dotenv()
61
 
62
+ BASE_BUILTIN_MODULES.remove("re")
63
+
64
  # fast prototyping tools
65
  @tool
66
  def get_current_time_in_timezone(timezone: str) -> str:
 
205
  )
206
  text_to_speech_pipe.model.enable_cpu_offload()
207
  text_to_speech_pipe.model.use_flash_attention_2=True
208
+ text_to_speech_pipe.model.pad_token_id=0 # 50257
209
+
210
+ from transformers import AutoTokenizer
211
+
212
+ tokenizer = AutoTokenizer.from_pretrained("suno/bark-small")
213
+ #print("suno/bark-small tokenizer pad_token_id: ", tokenizer.pad_token_id) # 0
214
+ #print("suno/bark-small tokenizer eos_token_id: ", tokenizer.eos_token_id) # none
215
+ text_to_speech_pipe.model.pad_token_id = tokenizer.pad_token_id
216
+ text_to_speech_pipe.model.eos_token_id = tokenizer.eos_token_id
217
+
218
 
219
  def speech_to_text(final_answer_text, agent_memory):
220
+ text = f"[clears throat] Here is the final answer: {final_answer_text}"
221
+ # attention_mask = [1] * len(text.split()) # Create an attention mask for your text
222
+
223
+ # Run the pipeline with the attention mask
224
  output = text_to_speech_pipe(text)
225
+
226
  # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook
227
  audio = np.array(output["audio"], dtype=np.float32)
228
  print("Original audio shape:", audio.shape)
 
349
  "numpy",
350
  "requests",
351
  "helium",
352
+ "bs4"
353
  ],
354
  # I could also add the authorized_imports from a LIST_SAFE_MODULES
355
  )
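
The tokenizer hunk above patches bark-small's missing generation ids to quiet transformers' pad/eos warnings: this tokenizer ships `pad_token_id = 0` and no `eos_token_id`. A condensed sketch of just that patch, assuming a transformers version whose `text-to-speech` pipeline supports `suno/bark-small` (the CPU-offload and flash-attention settings are left out):

```python
import torch
from transformers import AutoTokenizer, pipeline

tts = pipeline(
    task="text-to-speech",
    model="suno/bark-small",
    device=0 if torch.cuda.is_available() else "cpu",
)

# Mirror the commit: copy the tokenizer's special-token ids onto the model.
# For this tokenizer, pad_token_id is 0 and eos_token_id is None.
tokenizer = AutoTokenizer.from_pretrained("suno/bark-small")
tts.model.pad_token_id = tokenizer.pad_token_id
tts.model.eos_token_id = tokenizer.eos_token_id

output = tts("[clears throat] Here is the final answer.")
print(output["sampling_rate"])  # bark generates audio at 24 kHz
```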
requirements.txt CHANGED
@@ -4,18 +4,31 @@ requests
  duckduckgo_search
  pandas
  transformers
- # transformers[agents]
+ rank_bm25
  torch
+ torchvision
+ torchaudio
+ optimum
+ accelerate
+ beautifulsoup4
+ sounddevice
+ optimum
  langchain
  openai
- # accelerate
+ datasets
  langchain-community
  google-search-results
  smolagents[transformers]
+ smolagents[e2b]
+ smolagents[litellm]
  scikit-image
+ pillow
+ duckduckgo-search
  pycountry
  opentelemetry-sdk
  opentelemetry-exporter-otlp
  openinference-instrumentation-smolagents
+ helium
+ selenium
  python-dotenv
- langchain-openai
+ langchain-openai