laverdes committed on
Commit 79762d1 · 1 Parent(s): 84ac217

chore: speech-to-text for final answer (slow)

Files changed (2)
  1. app.py +59 -4
  2. tools/rag_transformers.py +55 -0
app.py CHANGED
@@ -2,10 +2,13 @@ import os
 import base64
 import math
 import pytz
+import torch
 import yaml
 import pycountry
 import subprocess
 import sys
+import numpy as np
+import sounddevice as sd
 
 from tools.final_answer import FinalAnswerTool
 from tools.visit_webpage import VisitWebpageTool
@@ -21,6 +24,7 @@ from datetime import datetime
 from skimage import io
 from PIL import Image
 from typing import Optional, Tuple
+from IPython.display import Audio, display
 
 from opentelemetry.sdk.trace import TracerProvider
 from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -190,7 +194,47 @@ def browser_automation(original_user_query:str)->str:
     print("vision_web_browser.py: ", result.stderr)
     return result.stdout
 
-# telemetry
+print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
+text_to_speech_pipe = pipeline(
+    "text-to-speech",
+    model="suno/bark-small",
+    device=0 if torch.cuda.is_available() else "cpu",
+)
+
+
+def speech_to_text(final_answer_text, agent_memory):
+    text = f"[clears throat] {final_answer_text}"
+    output = text_to_speech_pipe(text)
+    # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook
+    audio = np.array(output["audio"], dtype=np.float32)
+    print("Original audio shape:", audio.shape)
+
+    # Adjust audio shape if necessary:
+    if audio.ndim == 1:
+        # Mono audio, should be fine. You can check if your device expects stereo.
+        print("Mono audio... should be fine. You can check if your device expects stereo.")
+    elif audio.ndim == 2:
+        # Check if the number of channels is acceptable (e.g., 1 or 2)
+        channels = audio.shape[1]
+        if channels not in [1, 2]:
+            # Try to squeeze extra dimensions
+            audio = np.squeeze(audio)
+            print("Squeezed audio shape:", audio.shape)
+    else:
+        # If audio has more dimensions than expected, flatten or reshape as needed
+        audio = np.squeeze(audio)
+        print("Squeezed audio shape:", audio.shape)
+
+    # Play the audio using sounddevice
+    try:
+        sd.play(audio, output["sampling_rate"])
+        sd.wait()  # Wait until audio playback is complete
+    except Exception as e:
+        print(f"Error playing audio: {e}")
+
+    return True
+
+
 def initialize_langfuse_opentelemetry_instrumentation():
     LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
     LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY")
@@ -204,6 +248,8 @@ def initialize_langfuse_opentelemetry_instrumentation():
 
     SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
 
+
+# telemetry
 initialize_langfuse_opentelemetry_instrumentation()
 
 # load tools from /tools/
@@ -232,7 +278,15 @@ image_generation_tool_fast = Tool.from_space(
 )
 
 
-ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
+# ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
+
+
+ceo_model = HfApiModel(
+    max_tokens=2096, # 8096 for manager
+    temperature=0.5,
+    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # "meta-llama/Llama-3.3-70B-Instruct", # same as Qwen/Qwen2.5-Coder-32B-Instruct
+    custom_role_conversions=None,
+)
 
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
@@ -260,13 +314,14 @@ agent = CodeAgent(
     max_steps=20, # 15 is good for a light manager, too much when there is no need of a manager
     verbosity_level=2,
     grammar=None,
-    planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager)
+    # planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager) # test for crashing issues.
     name="Alfredo",
     description="CEO",
     prompt_templates=prompt_templates,
     # executor_type="e2b", # security, could also be "docker" (set keys)
     # sandbox=E2BSandbox() (or E2BExecutor?),
-    # step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and mangage it with alfred
+    # step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and manage it with alfred
+    final_answer_checks=[speech_to_text],
     additional_authorized_imports=[
         "geopandas",
         "plotly",
tools/rag_transformers.py CHANGED
@@ -0,0 +1,55 @@
+import datasets
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.retrievers import BM25Retriever
+from smolagents import Tool
+
+knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
+knowledge_base = knowledge_base.filter(lambda row: row["source"].startswith("huggingface/transformers"))
+
+source_docs = [
+    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]})
+    for doc in knowledge_base
+]
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=50,
+    add_start_index=True,
+    strip_whitespace=True,
+    separators=["\n\n", "\n", ".", " ", ""],
+)
+docs_processed = text_splitter.split_documents(source_docs)
+
+
+class TransformersRetrieverTool(Tool):
+    name = "transformers_retriever"
+    description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query."
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, docs, **kwargs):
+        super().__init__(**kwargs)
+        self.retriever = BM25Retriever.from_documents(
+            docs, k=10
+        )
+
+    def forward(self, query: str) -> str:
+        assert isinstance(query, str), "Your search query must be a string"
+
+        docs = self.retriever.invoke(
+            query,
+        )
+        return "\nRetrieved documents:\n" + "".join(
+            [
+                f"\n\n===== Document {str(i)} =====\n" + doc.page_content
+                for i, doc in enumerate(docs)
+            ]
+        )
+
+retriever_tool = TransformersRetrieverTool(docs_processed)
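
The new file defines `retriever_tool` but this commit does not register it anywhere in app.py. A minimal sketch of how it could be wired into the existing CodeAgent follows; the model_id stand-in and the example query are assumptions for illustration, not part of the commit:

```python
# Hypothetical wiring only (not in this commit): expose the BM25 retriever
# over the transformers docs to the agent built in app.py.
from smolagents import CodeAgent, HfApiModel

from tools.rag_transformers import retriever_tool  # defined at the bottom of the new file

# Assumed model_id; app.py points at a private inference endpoint instead.
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
)

agent = CodeAgent(
    tools=[retriever_tool],  # would sit alongside the other tools loaded from /tools/
    model=model,
    max_steps=20,
)

# Example query (illustrative): the agent can now call transformers_retriever.
print(agent.run("How do I load a tokenizer with AutoTokenizer?"))
```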