laverdes committed
Commit fae2aeb · 1 Parent(s): ba86979

chore: app update, authorized imports

Files changed (5):
  1. .gitignore +3 -0
  2. Gradio_UI.py +108 -1
  3. agent.json +1 -2
  4. app.py +19 -2
  5. requirements.txt +16 -3
.gitignore CHANGED
@@ -0,0 +1,3 @@
+ .env
+ tools/unassigned.py
+ /todos_and_alternatives.py
Gradio_UI.py CHANGED
@@ -13,16 +13,22 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import asyncio
  import mimetypes
  import os
  import re
  import shutil
  from typing import Optional

+ import torch
  from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types
- from smolagents.agents import ActionStep, MultiStepAgent
+ from smolagents.agents import ActionStep, MultiStepAgent, PlanningStep, TaskStep
  from smolagents.memory import MemoryStep
  from smolagents.utils import _is_package_available
+ from transformers import pipeline, AutoTokenizer
+
+ import numpy as np
+ import sounddevice as sd


  def pull_messages_from_step(
@@ -123,6 +129,93 @@ def pull_messages_from_step(
      yield gr.ChatMessage(role="assistant", content="-----")


+ text_to_speech_pipe = pipeline(
+     task="text-to-speech",
+     model="facebook/fastspeech2-en-ljspeech",  # "suno/bark-small",
+     device=0 if torch.cuda.is_available() else "cpu",
+     torch_dtype=torch.float16,
+ )
+ tokenizer = AutoTokenizer.from_pretrained("suno/bark-small")
+ # print("suno/bark-small tokenizer pad_token_id: ", tokenizer.pad_token_id)  # 0
+ # print("suno/bark-small tokenizer eos_token_id: ", tokenizer.eos_token_id)  # None
+ text_to_speech_pipe.model.pad_token_id = tokenizer.pad_token_id
+ text_to_speech_pipe.model.eos_token_id = tokenizer.eos_token_id
+
+
+ async def play_audio_async(audio_data, sampling_rate):
+     loop = asyncio.get_event_loop()
+     await loop.run_in_executor(None, sd.play, audio_data, sampling_rate)
+
+
+ async def speech_to_text(text_in):
+     text = f"[clears throat] {text_in}"
+     # attention_mask = [1] * len(text.split())
+     output = text_to_speech_pipe(text)
+     # display(Audio(output["audio"], rate=output["sampling_rate"]))  # notebook
+     audio = np.array(output["audio"])  # , dtype=np.float16)
+     print("Original audio shape:", audio.shape)
+
+     # Adjust audio shape if necessary:
+     if audio.ndim == 1:
+         # Mono audio, should be fine. You can check if your device expects stereo.
+         print("Mono audio... should be fine. You can check if your device expects stereo.")
+     elif audio.ndim == 2:
+         # Check if the number of channels is acceptable (e.g., 1 or 2)
+         channels = audio.shape[1]
+         if channels not in [1, 2]:
+             # Try to squeeze extra dimensions
+             audio = np.squeeze(audio)
+             print("Squeezed audio shape:", audio.shape)
+     else:
+         # If audio has more dimensions than expected, flatten or reshape as needed
+         audio = np.squeeze(audio)
+         print("Squeezed audio shape:", audio.shape)
+
+     # Play the audio using sounddevice
+     try:
+         # sd.play(audio, output["sampling_rate"])
+         # sd.wait()  # Wait until audio playback is complete
+         await play_audio_async(audio, output["sampling_rate"])
+     except Exception as e:
+         print(f"Error playing audio: {e}")
+
+     return True
+
+
+ def sync_speech_to_text(text_in):
+     text = f"[clears throat] {text_in}"
+     # attention_mask = [1] * len(text.split())
+     output = text_to_speech_pipe(text)
+     # display(Audio(output["audio"], rate=output["sampling_rate"]))  # notebook
+     audio = np.array(output["audio"], dtype=np.float32)
+     print("Original audio shape:", audio.shape)
+
+     # Adjust audio shape if necessary:
+     if audio.ndim == 1:
+         # Mono audio, should be fine. You can check if your device expects stereo.
+         print("Mono audio... should be fine. You can check if your device expects stereo.")
+     elif audio.ndim == 2:
+         # Check if the number of channels is acceptable (e.g., 1 or 2)
+         channels = audio.shape[1]
+         if channels not in [1, 2]:
+             # Try to squeeze extra dimensions
+             audio = np.squeeze(audio)
+             print("Squeezed audio shape:", audio.shape)
+     else:
+         # If audio has more dimensions than expected, flatten or reshape as needed
+         audio = np.squeeze(audio)
+         print("Squeezed audio shape:", audio.shape)
+
+     # Play the audio using sounddevice
+     try:
+         sd.play(audio, output["sampling_rate"])
+         sd.wait()  # Wait until audio playback is complete
+     except Exception as e:
+         print(f"Error playing audio: {e}")
+
+     return True
+
+
  def stream_to_gradio(
      agent,
      task: str,
@@ -151,6 +244,20 @@ def stream_to_gradio(
      for message in pull_messages_from_step(
          step_log,
      ):
+         """
+         if isinstance(step_log, ActionStep):
+             speech = message.content + ". This was an Action Step."
+         if isinstance(step_log, PlanningStep):
+             speech = message.content + ". We are done with the Planning Step."
+         if isinstance(step_log, TaskStep):
+             speech = message.content + ". This is a Task Step."
+         if isinstance(step_log, MemoryStep):
+             speech = "Memory Step: " + message.content
+         """
+         # if "Thought:" in message.content and isinstance(step_log, MemoryStep):
+         #     asyncio.run(speech_to_text(f"[clears throat] I am thinking that {message.content.replace('Thought:', '')}"))
+         #     sync_speech_to_text(f"[clears throat] I am thinking that {message.content.replace('Thought:', '')}")
+         # pass
          yield message

      final_answer = step_log  # Last log is the run's final_answer
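
The `play_audio_async` helper added above pushes sounddevice's playback call into an executor thread so the Gradio event loop stays responsive. A minimal self-contained sketch of that pattern, assuming `sounddevice` is installed and an output device is available (the sine-wave input is only a stand-in for the pipeline's audio):

```python
import asyncio

import numpy as np
import sounddevice as sd


def play_blocking(audio: np.ndarray, sampling_rate: int) -> None:
    # sd.play() starts playback and returns immediately;
    # sd.wait() blocks the calling thread until playback finishes.
    sd.play(audio, sampling_rate)
    sd.wait()


async def play_audio_async(audio: np.ndarray, sampling_rate: int) -> None:
    # Offload the blocking play-and-wait pair to a worker thread so the
    # asyncio event loop keeps serving other callbacks in the meantime.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, play_blocking, audio, sampling_rate)


async def main() -> None:
    rate = 22050
    t = np.linspace(0, 1.0, rate, endpoint=False)
    tone = 0.2 * np.sin(2 * np.pi * 440 * t).astype(np.float32)  # 1 s of 440 Hz
    await play_audio_async(tone, rate)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that the committed helper submits `sd.play` itself to the executor; since `sd.play` returns as soon as playback starts, awaiting it does not wait for the audio to finish, which is why this sketch pairs it with `sd.wait` inside the worker thread.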
agent.json CHANGED
@@ -47,7 +47,6 @@
    "statistics",
    "queue",
    "time",
-   "collections",
-   "re"
+   "collections"
  ]
}
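
`authorized_imports` in `agent.json` is the whitelist the smolagents Python interpreter consults before executing an agent's `import` statements; dropping `"re"` here pairs with the `BASE_BUILTIN_MODULES.remove("re")` call in the app.py diff below. A rough sketch of how such a whitelist is typically supplied when the agent is built in code — the tool list and model are placeholders, not this app's actual configuration, and `HfApiModel` assumes a smolagents version from around this commit:

```python
from smolagents import CodeAgent, HfApiModel
from smolagents.utils import BASE_BUILTIN_MODULES

# BASE_BUILTIN_MODULES is a plain module-level list, so removing an entry
# shrinks the default import whitelist for every agent built afterwards.
if "re" in BASE_BUILTIN_MODULES:
    BASE_BUILTIN_MODULES.remove("re")

agent = CodeAgent(
    tools=[],            # placeholder: the real app registers several @tool functions
    model=HfApiModel(),  # placeholder: the app loads its model via smolagents.cli
    # Modules beyond the built-in whitelist that generated code may import:
    additional_authorized_imports=["numpy", "requests", "helium", "bs4"],
)
```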
app.py CHANGED
@@ -24,7 +24,6 @@ from datetime import datetime
24
  from skimage import io
25
  from PIL import Image
26
  from typing import Optional, Tuple
27
- from IPython.display import Audio, display
28
 
29
  from opentelemetry.sdk.trace import TracerProvider
30
  from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -40,6 +39,7 @@ from langchain_openai import OpenAI
40
  from io import BytesIO
41
  from time import sleep
42
 
 
43
  from smolagents.agents import ActionStep
44
  from smolagents.cli import load_model
45
  from smolagents import (
@@ -59,6 +59,8 @@ from smolagents import (
59
  # load .env vars
60
  load_dotenv()
61
 
 
 
62
  # fast prototyping tools
63
  @tool
64
  def get_current_time_in_timezone(timezone: str) -> str:
@@ -203,10 +205,24 @@ text_to_speech_pipe = pipeline(
203
  )
204
  text_to_speech_pipe.model.enable_cpu_offload()
205
  text_to_speech_pipe.model.use_flash_attention_2=True
 
 
 
 
 
 
 
 
 
 
206
 
207
  def speech_to_text(final_answer_text, agent_memory):
208
- text = f"[clears throat] {final_answer_text}"
 
 
 
209
  output = text_to_speech_pipe(text)
 
210
  # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook
211
  audio = np.array(output["audio"], dtype=np.float32)
212
  print("Original audio shape:", audio.shape)
@@ -333,6 +349,7 @@ agent = CodeAgent(
333
  "numpy",
334
  "requests",
335
  "helium",
 
336
  ],
337
  # I could also add the authorized_imports from a LIST_SAFE_MODULES
338
  )
 
24
  from skimage import io
25
  from PIL import Image
26
  from typing import Optional, Tuple
 
27
 
28
  from opentelemetry.sdk.trace import TracerProvider
29
  from openinference.instrumentation.smolagents import SmolagentsInstrumentor
 
39
  from io import BytesIO
40
  from time import sleep
41
 
42
+ from smolagents.utils import BASE_BUILTIN_MODULES
43
  from smolagents.agents import ActionStep
44
  from smolagents.cli import load_model
45
  from smolagents import (
 
59
  # load .env vars
60
  load_dotenv()
61
 
62
+ BASE_BUILTIN_MODULES.remove("re")
63
+
64
  # fast prototyping tools
65
  @tool
66
  def get_current_time_in_timezone(timezone: str) -> str:
 
205
  )
206
  text_to_speech_pipe.model.enable_cpu_offload()
207
  text_to_speech_pipe.model.use_flash_attention_2=True
208
+ text_to_speech_pipe.model.pad_token_id=0 # 50257
209
+
210
+ from transformers import AutoTokenizer
211
+
212
+ tokenizer = AutoTokenizer.from_pretrained("suno/bark-small")
213
+ #print("suno/bark-small tokenizer pad_token_id: ", tokenizer.pad_token_id) # 0
214
+ #print("suno/bark-small tokenizer eos_token_id: ", tokenizer.eos_token_id) # none
215
+ text_to_speech_pipe.model.pad_token_id = tokenizer.pad_token_id
216
+ text_to_speech_pipe.model.eos_token_id = tokenizer.eos_token_id
217
+
218
 
219
  def speech_to_text(final_answer_text, agent_memory):
220
+ text = f"[clears throat] Here is the final answer: {final_answer_text}"
221
+ # attention_mask = [1] * len(text.split()) # Create an attention mask for your text
222
+
223
+ # Run the pipeline with the attention mask
224
  output = text_to_speech_pipe(text)
225
+
226
  # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook
227
  audio = np.array(output["audio"], dtype=np.float32)
228
  print("Original audio shape:", audio.shape)
 
349
  "numpy",
350
  "requests",
351
  "helium",
352
+ "bs4"
353
  ],
354
  # I could also add the authorized_imports from a LIST_SAFE_MODULES
355
  )
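
The tokenizer hunk above patches bark-small's missing generation ids to quiet transformers' pad/eos warnings: this tokenizer ships `pad_token_id = 0` and no `eos_token_id`. A condensed sketch of just that patch, assuming a transformers version whose `text-to-speech` pipeline supports `suno/bark-small` (the CPU-offload and flash-attention settings are left out):

```python
import torch
from transformers import AutoTokenizer, pipeline

tts = pipeline(
    task="text-to-speech",
    model="suno/bark-small",
    device=0 if torch.cuda.is_available() else "cpu",
)

# Mirror the commit: copy the tokenizer's special-token ids onto the model.
# For this tokenizer, pad_token_id is 0 and eos_token_id is None.
tokenizer = AutoTokenizer.from_pretrained("suno/bark-small")
tts.model.pad_token_id = tokenizer.pad_token_id
tts.model.eos_token_id = tokenizer.eos_token_id

output = tts("[clears throat] Here is the final answer.")
print(output["sampling_rate"])  # bark generates audio at 24 kHz
```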
requirements.txt CHANGED
@@ -4,18 +4,31 @@ requests
  duckduckgo_search
  pandas
  transformers
- # transformers[agents]
+ rank_bm25
  torch
+ torchvision
+ torchaudio
+ optimum
+ accelerate
+ beautifulsoup4
+ sounddevice
+ optimum
  langchain
  openai
- # accelerate
+ datasets
  langchain-community
  google-search-results
  smolagents[transformers]
+ smolagents[e2b]
+ smolagents[litellm]
  scikit-image
+ pillow
+ duckduckgo-search
  pycountry
  opentelemetry-sdk
  opentelemetry-exporter-otlp
  openinference-instrumentation-smolagents
+ helium
+ selenium
  python-dotenv
- langchain-openai
+ langchain-openai