chore: speech-to-text for final answer (slow)
- app.py: +59 -4
- tools/rag_transformers.py: +55 -0

app.py
CHANGED
@@ -2,10 +2,13 @@ import os
 import base64
 import math
 import pytz
+import torch
 import yaml
 import pycountry
 import subprocess
 import sys
+import numpy as np
+import sounddevice as sd
 
 from tools.final_answer import FinalAnswerTool
 from tools.visit_webpage import VisitWebpageTool
@@ -21,6 +24,7 @@ from datetime import datetime
 from skimage import io
 from PIL import Image
 from typing import Optional, Tuple
+from IPython.display import Audio, display
 
 from opentelemetry.sdk.trace import TracerProvider
 from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -190,7 +194,47 @@ def browser_automation(original_user_query:str)->str:
     print("vision_web_browser.py: ", result.stderr)
     return result.stdout
 
-
+print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
+text_to_speech_pipe = pipeline(
+    "text-to-speech",
+    model="suno/bark-small",
+    device = 0 if torch.cuda.is_available() else "cpu",
+)
+
+
+def speech_to_text(final_answer_text, agent_memory):
+    text = f"[clears throat] {final_answer_text}"
+    output = text_to_speech_pipe(text)
+    # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook
+    audio = np.array(output["audio"], dtype=np.float32)
+    print("Original audio shape:", audio.shape)
+
+    # Adjust audio shape if necessary:
+    if audio.ndim == 1:
+        # Mono audio, should be fine. You can check if your device expects stereo.
+        print("Mono audio... should be fine. You can check if your device expects stereo.")
+    elif audio.ndim == 2:
+        # Check if the number of channels is acceptable (e.g., 1 or 2)
+        channels = audio.shape[1]
+        if channels not in [1, 2]:
+            # Try to squeeze extra dimensions
+            audio = np.squeeze(audio)
+            print("Squeezed audio shape:", audio.shape)
+    else:
+        # If audio has more dimensions than expected, flatten or reshape as needed
+        audio = np.squeeze(audio)
+        print("Squeezed audio shape:", audio.shape)
+
+    # Play the audio using sounddevice
+    try:
+        sd.play(audio, output["sampling_rate"])
+        sd.wait() # Wait until audio playback is complete
+    except Exception as e:
+        print(f"Error playing audio: {e}")
+
+    return True
+
+
 def initialize_langfuse_opentelemetry_instrumentation():
     LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
     LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY")
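
Note: the hunk above builds the Bark text-to-speech pipeline at import time and plays the synthesized final answer through sounddevice, which requires a local audio output device. As an illustrative alternative (not part of this commit), the same output can be written to a WAV file for headless runs; this sketch assumes transformers, torch, numpy, and scipy are installed, and the helper name and file path are invented for the example:

# Hypothetical helper: render the final answer to a WAV file instead of playing it.
import numpy as np
import torch
from scipy.io import wavfile
from transformers import pipeline

tts = pipeline(
    "text-to-speech",
    model="suno/bark-small",
    device=0 if torch.cuda.is_available() else "cpu",
)

def save_final_answer_audio(final_answer_text, path="final_answer.wav"):
    output = tts(f"[clears throat] {final_answer_text}")
    audio = np.squeeze(np.array(output["audio"], dtype=np.float32))  # flatten to 1-D mono
    wavfile.write(path, output["sampling_rate"], audio)  # write float32 WAV
    return path
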
@@ -204,6 +248,8 @@ def initialize_langfuse_opentelemetry_instrumentation():
 
     SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
 
+
+# telemetry
 initialize_langfuse_opentelemetry_instrumentation()
 
 # load tools from /tools/
@@ -232,7 +278,15 @@ image_generation_tool_fast = Tool.from_space(
 )
 
 
-ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
+# ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
+
+
+ceo_model = HfApiModel(
+    max_tokens=2096, # 8096 for manager
+    temperature=0.5,
+    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # "meta-llama/Llama-3.3-70B-Instruct", # same as Qwen/Qwen2.5-Coder-32B-Instruct
+    custom_role_conversions=None,
+)
 
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
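
HfApiModel here points at a dedicated inference endpoint URL; the inline comments name the hosted checkpoints it mirrors. A minimal sketch of the hosted-checkpoint variant, assuming smolagents is installed and a Hugging Face token is configured in the environment (not part of this commit):

# Hypothetical variant: use a hosted model id instead of the dedicated endpoint.
from smolagents import HfApiModel

ceo_model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # or "meta-llama/Llama-3.3-70B-Instruct"
    custom_role_conversions=None,
)
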
@@ -260,13 +314,14 @@ agent = CodeAgent(
     max_steps=20, # 15 is good for a light manager, too much when there is no need of a manager
     verbosity_level=2,
     grammar=None,
-    planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager)
+    # planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager) # test for crashing issues.
     name="Alfredo",
     description="CEO",
     prompt_templates=prompt_templates,
     # executor_type="e2b", # security, could also be "docker" (set keys)
     # sandbox=E2BSandbox() (or E2BExecutor?),
-    # step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and
+    # step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and manage it with alfred
+    final_answer_checks=[speech_to_text],
     additional_authorized_imports=[
         "geopandas",
         "plotly",
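
speech_to_text is wired in through final_answer_checks, so it runs once the agent proposes a final answer; despite its name it performs text-to-speech, ignores agent_memory, and returns True so the check always passes. It can be smoke-tested outside the agent loop; a minimal, illustrative call, assuming an audio output device is available:

# Hypothetical smoke test for the new hook (agent_memory is unused, so None suffices).
if __name__ == "__main__":
    speech_to_text("The answer is forty-two.", None)  # synthesizes and plays the sentence
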

tools/rag_transformers.py
CHANGED
@@ -0,0 +1,55 @@
+import datasets
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.retrievers import BM25Retriever
+from smolagents import Tool
+
+knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
+knowledge_base = knowledge_base.filter(lambda row: row["source"].startswith("huggingface/transformers"))
+
+source_docs = [
+    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]})
+    for doc in knowledge_base
+]
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=50,
+    add_start_index=True,
+    strip_whitespace=True,
+    separators=["\n\n", "\n", ".", " ", ""],
+)
+docs_processed = text_splitter.split_documents(source_docs)
+
+
+class TransformersRetrieverTool(Tool):
+    name = "transformers_retriever"
+    description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query."
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, docs, **kwargs):
+        super().__init__(**kwargs)
+        self.retriever = BM25Retriever.from_documents(
+            docs, k=10
+        )
+
+    def forward(self, query: str) -> str:
+        assert isinstance(query, str), "Your search query must be a string"
+
+        docs = self.retriever.invoke(
+            query,
+        )
+        return "\nRetrieved documents:\n" + "".join(
+            [
+                f"\n\n===== Document {str(i)} =====\n" + doc.page_content
+                for i, doc in enumerate(docs)
+            ]
+        )
+
+retriever_tool = TransformersRetrieverTool(docs_processed)
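
The retriever is built at module import: loading the m-ric/huggingface_doc dataset, filtering to transformers sources, chunking, and fitting BM25 all happen when tools/rag_transformers.py is first imported, which adds to startup time. An illustrative way to exercise it directly or register it with the agent in app.py (not part of this commit):

# Hypothetical usage: query the tool directly, or pass it to the CodeAgent.
from tools.rag_transformers import retriever_tool

print(retriever_tool.forward("How to initialize a text-to-speech pipeline"))

# in app.py, alongside the other tools:
# agent = CodeAgent(tools=[retriever_tool, ...], model=ceo_model, ...)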